454 lines
21 KiB
C#
454 lines
21 KiB
C#
using CPRNIMS.Infrastructure.Dto.Canvass.Response;
|
|
using CPRNIMS.Infrastructure.Dto.Canvass.Result;
|
|
using Microsoft.Extensions.Configuration;
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.Globalization;
|
|
using System.Linq;
|
|
using System.Text;
|
|
using System.Text.Json;
|
|
using System.Text.Json.Serialization;
|
|
using System.Text.RegularExpressions;
|
|
using System.Threading.Tasks;
|
|
|
|
namespace CPRNIMS.Domain.Services.Canvass
|
|
{
|
|
public class SupplierSearchService
|
|
{
|
|
/// <summary>HTTP client used for every outbound request this service makes.</summary>
private readonly HttpClient _httpClient;

/// <summary>Configuration source for the <c>Groq:*</c> and <c>Tavily:*</c> settings.</summary>
private readonly IConfiguration _config;

/// <summary>
/// Relative paths appended, in this order, to a supplier's base URL when looking
/// for a page that carries contact details.
/// </summary>
private static readonly string[] ContactPaths = new[]
{
    "/contact",
    "/contact-us",
    "/pages/contact-us",
    "/about/contact",
    "/about",
};
|
|
/// <summary>
/// Checks whether <paramref name="incoming"/> is already on record, escalating through
/// three layers: (1) an exact match on e-mail, normalized phone or website domain,
/// (2) a first-word name pre-filter that keeps at most five candidates, and
/// (3) a Groq chat-completion call that judges those candidates.
/// Handles rebranding, spacing in phone numbers, name variations, etc.
/// </summary>
/// <param name="incoming">The supplier that is about to be added.</param>
/// <param name="existingSuppliers">Suppliers already on record; null or empty means nothing to match.</param>
/// <returns>The matched existing SupplierId, or null if no match.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="incoming"/> is null while there are existing suppliers to compare against.
/// </exception>
/// <exception cref="HttpRequestException">The Groq endpoint returned a non-success status.</exception>
public async Task<int?> FindMatchingExistingSupplierAsync(
    SupplierResponse incoming,
    List<SupplierResponse> existingSuppliers)
{
    if (existingSuppliers is null || existingSuppliers.Count == 0) return null;
    if (incoming is null) throw new ArgumentNullException(nameof(incoming));

    // ── Layer 1: Exact C# match (fast, free, no API call) ──────────────
    var incomingEmail = (incoming.EmailAddress ?? string.Empty).Trim();
    var incomingPhone = NormalizePhone(incoming.ContactNo ?? string.Empty);
    var incomingDomain = ExtractDomain(incoming.Website ?? string.Empty);

    foreach (var s in existingSuppliers)
    {
        if (s is null) continue;

        // Ordinal, case-insensitive comparison: these are identifiers, not linguistic text.
        if (incomingEmail.Length > 0
            && string.Equals(incomingEmail, (s.EmailAddress ?? string.Empty).Trim(), StringComparison.OrdinalIgnoreCase))
            return s.SupplierId;

        if (incomingPhone.Length > 0
            && string.Equals(incomingPhone, NormalizePhone(s.ContactNo ?? string.Empty), StringComparison.Ordinal))
            return s.SupplierId;

        if (incomingDomain.Length > 0
            && string.Equals(incomingDomain, ExtractDomain(s.Website ?? string.Empty), StringComparison.OrdinalIgnoreCase))
            return s.SupplierId;
    }

    // ── Layer 2: Fuzzy C# pre-filter — narrow to top candidates ────────
    // Empty entries are removed so a leading or doubled space does not hide the first word.
    static string FirstWord(string name) =>
        (name ?? string.Empty).Split(' ', StringSplitOptions.RemoveEmptyEntries).FirstOrDefault() ?? string.Empty;

    var incomingFirstWord = FirstWord(incoming.SupplierName);

    // Ignore short words like "co", "ph": they cannot identify a company.
    if (incomingFirstWord.Length <= 2) return null;

    var candidates = existingSuppliers
        // Keep if first word matches (e.g. "Linde" in "Linde PH" vs "Linde Philippines")
        .Where(s => s is not null
            && FirstWord(s.SupplierName).StartsWith(incomingFirstWord, StringComparison.OrdinalIgnoreCase))
        .Take(5) // max 5 candidates to Groq — well within token limit
        .Select(s => new
        {
            s.SupplierId,
            s.SupplierName,
            s.EmailAddress,
            s.ContactNo,
            s.Website
        })
        .ToList();

    // No fuzzy candidates found — it's a new supplier
    if (candidates.Count == 0) return null;

    // ── Layer 3: Groq fuzzy match — only on small candidate list ────────
    var incomingJson = JsonSerializer.Serialize(new
    {
        incoming.SupplierName,
        incoming.EmailAddress,
        incoming.ContactNo,
        incoming.Website
    });

    var candidatesJson = JsonSerializer.Serialize(candidates);

    var prompt =
        "TASK: Determine if the INCOMING supplier already exists in the CANDIDATES list.\n\n" +
        "MATCHING RULES (any one is enough):\n" +
        "1. Same email address (case-insensitive).\n" +
        "2. Same phone number after stripping spaces, dashes, country codes.\n" +
        "3. Same company despite rebranding, abbreviation, or spacing differences.\n" +
        "4. Same website domain (ignore www, http/https).\n\n" +
        "If matched: respond ONLY { \"matched\": true, \"supplierId\": <number> }\n" +
        "If not matched: respond ONLY { \"matched\": false, \"supplierId\": null }\n" +
        "No explanation. No markdown. JSON only.\n\n" +
        $"INCOMING:\n{incomingJson}\n\n" +
        $"CANDIDATES:\n{candidatesJson}";

    var payload = new
    {
        model = _config["Groq:Model"] ?? "llama-3.1-8b-instant",
        stream = false,
        max_tokens = 50,
        temperature = 0,
        messages = new[]
        {
            new { role = "system", content = "You are a supplier deduplication engine. Return ONLY valid JSON. No markdown. No explanation." },
            new { role = "user", content = prompt }
        }
    };

    // One options instance for both deserializations in this call.
    var jsonOptions = new JsonSerializerOptions { PropertyNameCaseInsensitive = true };

    using var request = new HttpRequestMessage(HttpMethod.Post, _config["Groq:ApiUrl"]);
    request.Headers.Add("Authorization", $"Bearer {_config["Groq:ApiKey"]}");
    request.Content = new StringContent(
        JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json");

    using var response = await _httpClient.SendAsync(request);
    response.EnsureSuccessStatusCode();

    var body = await response.Content.ReadAsStringAsync();
    var groqResp = JsonSerializer.Deserialize<GroqResponse>(body, jsonOptions);

    // FirstOrDefault instead of [0]: an empty choices list must not throw.
    var rawText = groqResp?.Choices?.FirstOrDefault()?.Message?.Content ?? string.Empty;

    // Strip markdown code fences the model may add despite the instructions.
    rawText = Regex.Replace(rawText, @"```[a-z]*|```", "").Trim();
    if (rawText.Length == 0) return null;

    GroqMatchResult match;
    try
    {
        match = JsonSerializer.Deserialize<GroqMatchResult>(rawText, jsonOptions);
    }
    catch (JsonException)
    {
        // The model answered with something that is not the requested JSON object;
        // an unreadable verdict is treated as "no match".
        return null;
    }

    if (match?.Matched != true) return null;

    // Only accept an id that was actually offered to the model; anything else is
    // a fabricated answer and must not be linked to a real supplier.
    int? matchedId = match.SupplierId;
    if (matchedId is null || !candidates.Any(c => c.SupplierId == matchedId)) return null;

    return matchedId;
}
|
|
/// <summary>
/// Creates the service around the HTTP client and configuration it uses for its outbound calls.
/// </summary>
/// <param name="httpClient">Client stored in <c>_httpClient</c>.</param>
/// <param name="config">Configuration stored in <c>_config</c>.</param>
public SupplierSearchService(HttpClient httpClient, IConfiguration config)
    => (_httpClient, _config) = (httpClient, config);
|
|
|
|
/// <summary>
/// Runs the supplier discovery pipeline for one item: a Tavily web search, a fetch of
/// contact pages from the discovered sites, and a Groq pass that turns the combined
/// text into supplier records.
/// </summary>
/// <param name="itemName">Name of the item to source.</param>
/// <param name="itemDescription">Free-text description of the item.</param>
/// <param name="isInternational">
/// True to search "all over Asia including Philippines"; false to search "Philippines" only.
/// </param>
/// <returns>The suppliers produced by the Groq filtering step.</returns>
public async Task<List<SupplierResponse>> SearchAndFilterSuppliersAsync(
    string itemName, string itemDescription, bool isInternational)
{
    string searchScope;
    if (isInternational)
    {
        searchScope = "all over Asia including Philippines";
    }
    else
    {
        searchScope = "Philippines";
    }

    // 1) Web search: result text plus the base URLs of the sites that were found.
    var tavily = await SearchTavilyAsync(itemName, itemDescription, searchScope);

    // 2) Contact details scraped from those sites.
    var contactDetails = await FetchContactPagesAsync(tavily.urls);

    // 3) Both sources go to Groq as one text; the marker separates the two sections.
    var groqInput = string.Concat(tavily.content, " CONTACT_PAGES_DATA: ", contactDetails);

    return await FilterWithGroqAsync(itemName, itemDescription, groqInput, isInternational);
}
|
|
|
|
// ── Tavily ──
|
|
/// <summary>
/// Queries the Tavily search endpoint for suppliers of an item and condenses the results.
/// </summary>
/// <param name="itemName">Name of the item to source.</param>
/// <param name="itemDescription">Free-text description of the item.</param>
/// <param name="locality">Geographic scope inserted into the search query.</param>
/// <returns>
/// <c>content</c>: the numbered results (title, URL, up to 300 characters of cleaned text each),
/// truncated to 2000 characters overall. <c>urls</c>: the distinct <c>scheme://host</c> base
/// URLs of the http/https results, in result order.
/// </returns>
/// <exception cref="HttpRequestException">The Tavily endpoint returned a non-success status.</exception>
private async Task<(string content, List<string> urls)> SearchTavilyAsync(
    string itemName, string itemDescription, string locality)
{
    var query = $"{itemName} {itemDescription} suppliers {locality} budget price contact email phone";

    var payload = new
    {
        query,
        max_results = 10,
        search_depth = "advanced",
        include_answer = false
    };

    using var request = new HttpRequestMessage(HttpMethod.Post, _config["Tavily:SearchUrl"]);
    request.Headers.Add("Authorization", $"Bearer {_config["Tavily:ApiKey"]}");
    request.Content = new StringContent(
        JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json");

    using var response = await _httpClient.SendAsync(request);
    response.EnsureSuccessStatusCode();

    var body = await response.Content.ReadAsStringAsync();
    var result = JsonSerializer.Deserialize<TavilySearchResult>(body,
        new JsonSerializerOptions { PropertyNameCaseInsensitive = true });

    var sb = new StringBuilder();
    var urls = new List<string>();
    var seenBaseUrls = new HashSet<string>(StringComparer.Ordinal);
    var index = 1;

    foreach (var r in result?.Results ?? new())
    {
        // Keep printable ASCII only, collapse long whitespace runs, cap each snippet.
        var clean = Regex.Replace(r.Content ?? "", @"[^\x20-\x7E]", " ");
        clean = Regex.Replace(clean, @"\s{3,}", " ");
        if (clean.Length > 300) clean = clean[..300];
        sb.Append($"{index}. Title:{r.Title}|URL:{r.Url}|Content:{clean}|");

        // Collect base URLs for contact page fetching. TryCreate replaces the former
        // try/catch, and only http/https results are kept because these URLs are
        // fetched later with HttpClient.
        if (Uri.TryCreate(r.Url, UriKind.Absolute, out var uri)
            && (uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps)
            && !string.IsNullOrEmpty(uri.Host))
        {
            var baseUrl = $"{uri.Scheme}://{uri.Host}";
            if (seenBaseUrls.Add(baseUrl)) urls.Add(baseUrl);
        }

        index++;
    }

    var fullText = sb.ToString();
    if (fullText.Length > 2000) fullText = fullText[..2000];

    return (fullText, urls);
}
|
|
|
|
// ── Fetch Contact Pages ───
|
|
/// <summary>
/// Best-effort scrape of contact details from supplier websites. For each of the first five
/// base URLs, the paths in <c>ContactPaths</c> are tried in order until one page yields
/// contact-looking tokens; unreachable, slow or failing pages are skipped.
/// </summary>
/// <param name="baseUrls">Base URLs (<c>scheme://host</c>) to probe; null is treated as empty.</param>
/// <returns>
/// Entries of the form <c>[baseUrl]: tokens | </c> concatenated and truncated to 2000 characters;
/// an empty string when nothing was found.
/// </returns>
private async Task<string> FetchContactPagesAsync(List<string> baseUrls)
{
    const int MaxLength = 2000;

    if (baseUrls is null) return string.Empty;

    // A token is kept when it looks like an e-mail address, a phone number, or a contact keyword.
    // Keywords are matched case-insensitively so "Email", "Phone", "Contact" are not missed.
    static bool IsContactSignal(string token) =>
        token.Contains('@')
        || Regex.IsMatch(token, @"\+?[\d\-\(\)]{7,}")
        || token.Contains("email", StringComparison.OrdinalIgnoreCase)
        || token.Contains("phone", StringComparison.OrdinalIgnoreCase)
        || token.Contains("contact", StringComparison.OrdinalIgnoreCase)
        || token.Contains("mobile", StringComparison.OrdinalIgnoreCase)
        || token.Contains("tel", StringComparison.OrdinalIgnoreCase);

    var sb = new StringBuilder();

    // Limit to top 5 domains to avoid timeout
    foreach (var baseUrl in baseUrls.Take(5))
    {
        // Anything past the cap is discarded below, so further requests would be wasted.
        if (sb.Length >= MaxLength) break;

        foreach (var path in ContactPaths)
        {
            try
            {
                // One 5-second budget covers both the request and reading the body.
                using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(5));
                using var resp = await _httpClient.GetAsync(baseUrl + path, cts.Token);

                if (!resp.IsSuccessStatusCode) continue;

                var html = await resp.Content.ReadAsStringAsync(cts.Token);

                // Strip HTML tags, drop non-printable/non-ASCII characters, collapse whitespace runs.
                var text = Regex.Replace(html, @"<[^>]+>", " ");
                text = Regex.Replace(text, @"[^\x20-\x7E]", " ");
                text = Regex.Replace(text, @"\s{3,}", " ");

                // Keep only the space-separated tokens that carry a contact signal (first 50).
                var contactTokens = text
                    .Split(' ', StringSplitOptions.RemoveEmptyEntries)
                    .Where(IsContactSignal)
                    .Take(50);

                var contactText = string.Join(" ", contactTokens);
                if (!string.IsNullOrWhiteSpace(contactText))
                {
                    sb.Append($"[{baseUrl}]: {contactText} | ");
                    break; // Found contact page for this domain, move to next
                }
            }
            catch (Exception)
            {
                // Deliberate best effort: timeout, unreachable host or bad URL — try the next path.
            }
        }
    }

    var collected = sb.ToString();
    return collected.Length > MaxLength ? collected[..MaxLength] : collected;
}
|
|
|
|
// ── Groq ─────────────────────────────────────────────────────────────────
|
|
/// <summary>
/// Sends the collected search and contact text to Groq, parses the JSON array it returns,
/// and converts it into at most ten de-duplicated <c>SupplierResponse</c> records.
/// </summary>
/// <param name="itemName">Name of the item to source.</param>
/// <param name="itemDescription">Free-text description of the item.</param>
/// <param name="searchContent">Search results and contact page data to extract suppliers from.</param>
/// <param name="isInternational">
/// True to accept the listed Asian countries; false to accept the Philippines only.
/// </param>
/// <returns>
/// The extracted suppliers; an empty list when the model's reply contains no parseable JSON array.
/// </returns>
/// <exception cref="HttpRequestException">The Groq endpoint returned a non-success status.</exception>
private async Task<List<SupplierResponse>> FilterWithGroqAsync(
    string itemName, string itemDescription, string searchContent, bool isInternational)
{
    var prompt = BuildSupplierExtractionPrompt(itemName, itemDescription, searchContent, isInternational);

    var payload = new
    {
        model = _config["Groq:Model"] ?? "llama-3.1-8b-instant",
        stream = false,
        max_tokens = 2048,
        temperature = 0.1,
        messages = new[]
        {
            new { role = "system", content = "You are a supplier data extractor. Extract real contact details from provided content. Return ONLY a valid JSON array, no markdown, no explanation." },
            new { role = "user", content = prompt }
        }
    };

    using var request = new HttpRequestMessage(HttpMethod.Post, _config["Groq:ApiUrl"]);
    request.Headers.Add("Authorization", $"Bearer {_config["Groq:ApiKey"]}");
    request.Content = new StringContent(
        JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json");

    using var response = await _httpClient.SendAsync(request);
    response.EnsureSuccessStatusCode();

    var body = await response.Content.ReadAsStringAsync();
    var groqResp = JsonSerializer.Deserialize<GroqResponse>(body,
        new JsonSerializerOptions { PropertyNameCaseInsensitive = true });

    // FirstOrDefault instead of [0]: an empty choices list must not throw.
    var rawText = groqResp?.Choices?.FirstOrDefault()?.Message?.Content ?? string.Empty;

    // Take everything from the first '[' to the last ']' so surrounding chatter is ignored.
    var arrayMatch = Regex.Match(rawText, @"\[[\s\S]*\]");
    if (!arrayMatch.Success) return new List<SupplierResponse>();

    var jsonOptions = new JsonSerializerOptions
    {
        PropertyNameCaseInsensitive = true,
        Converters = { new FlexibleDecimalConverter() }
    };

    List<GroqSupplierResult> groqList;
    try
    {
        groqList = JsonSerializer.Deserialize<List<GroqSupplierResult>>(arrayMatch.Value, jsonOptions)
                   ?? new List<GroqSupplierResult>();
    }
    catch (JsonException)
    {
        // A malformed or truncated array is handled like "no array at all" above.
        return new List<SupplierResponse>();
    }

    var allowedCountries = isInternational
        ? new HashSet<string>(StringComparer.OrdinalIgnoreCase)
        {
            "Philippines", "China", "Japan", "South Korea", "Taiwan",
            "India", "Singapore", "Malaysia", "Thailand", "Vietnam",
            "Indonesia", "Hong Kong"
        }
        : new HashSet<string>(StringComparer.OrdinalIgnoreCase)
        {
            "Philippines"
        };

    // The sets compare case-insensitively, so keys only need trimming.
    var seenNames = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var seenEmails = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var seenPhones = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var suppliers = new List<SupplierResponse>();

    foreach (var s in groqList)
    {
        if (s is null) continue; // a literal null inside the JSON array

        var name = (s.CompanyName ?? "").Trim();
        var email = (s.ContactEmail ?? "").Trim();
        var phone = NormalizePhone(s.PhoneNumber ?? "");

        // Skip if no company name or no email
        if (name.Length == 0 || email.Length == 0) continue;

        // Enforce the locality rule in code as well; the prompt alone does not guarantee it.
        if (!IsAllowedCountry(s.Country, allowedCountries)) continue;

        // Skip if company name, email, OR phone already seen
        if (seenNames.Contains(name)) continue;
        if (seenEmails.Contains(email)) continue;
        if (phone.Length > 0 && seenPhones.Contains(phone)) continue;

        seenNames.Add(name);
        seenEmails.Add(email);
        if (phone.Length > 0) seenPhones.Add(phone);

        suppliers.Add(new SupplierResponse
        {
            SupplierName = s.CompanyName,
            EmailAddress = s.ContactEmail,
            ContactNo = s.PhoneNumber ?? string.Empty,
            Address = s.Country ?? string.Empty,
            IsActive = true,
            VatInc = false,
            Currency = "PHP",
            CurrencyId = 1,
            PaymentTermsId = 1,
            PaymentTerms = "30 Days",
            LeadTime = "7-14 Days",
            TinNo = string.Empty,
            ContactPerson = string.Empty,
            Website = s.Website ?? string.Empty,
        });

        if (suppliers.Count >= 10) break;
    }

    return suppliers;
}

/// <summary>Builds the user prompt that asks Groq to extract suppliers from <paramref name="searchContent"/>.</summary>
private static string BuildSupplierExtractionPrompt(
    string itemName, string itemDescription, string searchContent, bool isInternational)
{
    var localityRule = isInternational
        ? "1. Include suppliers from Philippines first, then other Asian countries (e.g. China, Japan, South Korea, Taiwan, India, Singapore).\n"
        : "1. STRICT: Include ONLY suppliers based in the Philippines. Exclude ANY supplier from other countries — even if they ship to Philippines. If a supplier's country is not Philippines, skip it entirely.\n";

    return
        $"TASK: Extract up to 10 unique suppliers that sell: [{itemName}] — {itemDescription}.\n\n" +
        "RULES:\n" +
        localityRule +
        "2. Prefer budget-friendly suppliers with known pricing.\n" +
        "3. DEDUPLICATION (strict): Each entry must have a unique company_name, contact_email, AND phone_number.\n" +
        " - If two entries share the same email OR phone number, keep only the first.\n" +
        " - If two inferred emails resolve to the same address, keep only one.\n" +
        "4. CONTACT EXTRACTION:\n" +
        " - Look in the CONTACT_PAGES_DATA section for real emails and phone numbers.\n" +
        " - Use exact values found. Do not fabricate contact details.\n" +
        " - If no email is found for a domain, infer: sales@domain.com or info@domain.com.\n" +
        " - If no phone is found, use null — do not guess.\n" +
        "5. estimated_price_usd MUST be a number (e.g. 12.50) or null. NEVER a string.\n" +
        "6. Exclude any supplier with no company_name or no contact_email.\n\n" +

        "OUTPUT FORMAT:\n" +
        "Return ONLY a valid raw JSON array. No markdown. No explanation. No extra text.\n" +
        "Each object must have exactly these fields:\n" +
        " company_name (string)\n" +
        " country (string)\n" +
        " phone_number (string | null)\n" +
        " contact_email (string | null)\n" +
        " website (string | null)\n" +
        " estimated_price_usd (number | null)\n" +
        " item_specifications (string | null)\n\n" +

        $"DATA:\n{searchContent}";
}

/// <summary>
/// Returns false only when a country is stated and names none of the allowed countries.
/// A blank country is accepted because it cannot be judged; matching is by substring so
/// values such as "Manila, Philippines" pass.
/// </summary>
private static bool IsAllowedCountry(string country, HashSet<string> allowedCountries)
{
    if (string.IsNullOrWhiteSpace(country)) return true;

    return allowedCountries.Any(allowed =>
        country.Contains(allowed, StringComparison.OrdinalIgnoreCase));
}
|
|
/// <summary>
/// Reduces a phone number to its digits so that differently formatted numbers compare equal.
/// </summary>
/// <param name="phone">Phone number in any formatting; null or blank yields an empty string.</param>
/// <returns>
/// The digits of <paramref name="phone"/>; when that is exactly 11 digits beginning with "1"
/// (US/CA country code), the leading "1" is dropped.
/// </returns>
private static string NormalizePhone(string phone)
{
    if (string.IsNullOrWhiteSpace(phone))
    {
        return string.Empty;
    }

    // Keep decimal digits only; every other character is formatting.
    var digitsOnly = new string(phone.Where(char.IsDigit).ToArray());

    var hasLeadingCountryOne = digitsOnly.Length == 11 && digitsOnly[0] == '1';

    return hasLeadingCountryOne ? digitsOnly.Substring(1) : digitsOnly;
}
|
|
/// <summary>
/// Extracts the host name from a website value so that two spellings of the same site
/// compare equal: the scheme, port, path and a leading "www." are dropped.
/// </summary>
/// <param name="url">
/// A URL with or without a scheme (e.g. "https://www.example.com/about" or "example.com").
/// </param>
/// <returns>
/// The host without a leading "www."; an empty string when <paramref name="url"/> is
/// null, blank, or cannot be parsed as an absolute URL.
/// </returns>
private static string ExtractDomain(string url)
{
    if (string.IsNullOrWhiteSpace(url)) return string.Empty;

    var candidate = url.Trim();

    // Add a scheme only when none is present. Testing for "scheme://" (rather than for a
    // leading "http") keeps "HTTP://x.com" from being double-prefixed and keeps bare
    // domains such as "httpbin.org" from being mistaken for already-qualified URLs.
    if (!Regex.IsMatch(candidate, @"^[A-Za-z][A-Za-z0-9+.\-]*://"))
        candidate = "https://" + candidate;

    if (!Uri.TryCreate(candidate, UriKind.Absolute, out var uri) || string.IsNullOrEmpty(uri.Host))
        return string.Empty;

    var host = uri.Host;
    return host.StartsWith("www.", StringComparison.OrdinalIgnoreCase) ? host[4..] : host;
}
|
|
}
|
|
}
|