NonInventPurchasingSystem/CPRNIMS.Domain/Services/Canvass/SupplierSearchService.cs

454 lines
21 KiB
C#

using CPRNIMS.Infrastructure.Dto.Canvass.Response;
using CPRNIMS.Infrastructure.Dto.Canvass.Result;
using Microsoft.Extensions.Configuration;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace CPRNIMS.Domain.Services.Canvass
{
public class SupplierSearchService
{
/// <summary>HTTP client used for every outbound request this class makes (Tavily search, Groq chat completions, supplier web pages).</summary>
private readonly HttpClient _httpClient;
/// <summary>Configuration source; this class reads the <c>Groq:ApiUrl</c>, <c>Groq:ApiKey</c>, <c>Groq:Model</c>, <c>Tavily:SearchUrl</c> and <c>Tavily:ApiKey</c> keys from it.</summary>
private readonly IConfiguration _config;
// Path suffixes appended to a supplier's base URL when looking for a page with contact details.
// They are tried in the order listed; probing of a domain stops at the first path that yields contact text.
private static readonly string[] ContactPaths =
{ "/contact", "/contact-us", "/pages/contact-us", "/about/contact", "/about" };
/// <summary>
/// Looks for an already-registered supplier that represents the same company as
/// <paramref name="incoming"/>. Three layers are tried, cheapest first:
/// (1) exact match on email, normalized phone number or website domain;
/// (2) a first-word name pre-filter that keeps at most five candidates;
/// (3) a Groq chat-completion call that judges the incoming supplier against those candidates.
/// </summary>
/// <param name="incoming">The supplier that is about to be added.</param>
/// <param name="existingSuppliers">Suppliers already on record; null or empty means "nothing to match".</param>
/// <returns>
/// The <c>SupplierId</c> of the matching existing supplier, or <c>null</c> when no match is found,
/// when the model's reply is empty or not valid JSON, or when the model names an id that was not
/// among the candidates it was given.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="incoming"/> is null.</exception>
/// <exception cref="HttpRequestException">The Groq request fails or returns a non-success status code.</exception>
/// <exception cref="JsonException">The Groq response envelope itself is not valid JSON.</exception>
public async Task<int?> FindMatchingExistingSupplierAsync(
    SupplierResponse incoming,
    List<SupplierResponse> existingSuppliers)
{
    if (incoming is null) throw new ArgumentNullException(nameof(incoming));
    if (existingSuppliers is null || existingSuppliers.Count == 0) return null;

    // ── Layer 1: exact match (fast, free, no API call) ─────────────────
    var incomingEmail = (incoming.EmailAddress ?? string.Empty).Trim();
    var incomingPhone = NormalizePhone(incoming.ContactNo ?? string.Empty);
    var incomingDomain = ExtractDomain(incoming.Website ?? string.Empty);

    foreach (var existing in existingSuppliers)
    {
        // Ordinal, case-insensitive comparison avoids culture-dependent lowercasing.
        var existingEmail = (existing.EmailAddress ?? string.Empty).Trim();
        if (incomingEmail.Length > 0
            && string.Equals(incomingEmail, existingEmail, StringComparison.OrdinalIgnoreCase))
        {
            return existing.SupplierId;
        }

        if (incomingPhone.Length > 0
            && incomingPhone == NormalizePhone(existing.ContactNo ?? string.Empty))
        {
            return existing.SupplierId;
        }

        if (incomingDomain.Length > 0
            && string.Equals(incomingDomain, ExtractDomain(existing.Website ?? string.Empty), StringComparison.OrdinalIgnoreCase))
        {
            return existing.SupplierId;
        }
    }

    // ── Layer 2: name pre-filter — narrow to a handful of candidates ───
    // The incoming first word is computed once (not per element) and trimmed first so a
    // leading space does not produce an empty first word.
    var incomingFirstWord = (incoming.SupplierName ?? string.Empty).Trim().Split(' ')[0];

    // Very short first words ("co", "ph") are too generic to pre-filter on; with no usable
    // first word nothing can qualify as a candidate, so this is a new supplier.
    if (incomingFirstWord.Length <= 2) return null;

    var candidates = existingSuppliers
        .Where(s => (s.SupplierName ?? string.Empty).Trim().Split(' ')[0]
            .StartsWith(incomingFirstWord, StringComparison.OrdinalIgnoreCase))
        .Take(5) // keep the prompt small
        .Select(s => new
        {
            s.SupplierId,
            s.SupplierName,
            s.EmailAddress,
            s.ContactNo,
            s.Website
        })
        .ToList();

    if (candidates.Count == 0) return null;

    // ── Layer 3: Groq fuzzy match over the small candidate list ────────
    var incomingJson = JsonSerializer.Serialize(new
    {
        incoming.SupplierName,
        incoming.EmailAddress,
        incoming.ContactNo,
        incoming.Website
    });
    var candidatesJson = JsonSerializer.Serialize(candidates);
    var prompt =
        "TASK: Determine if the INCOMING supplier already exists in the CANDIDATES list.\n\n" +
        "MATCHING RULES (any one is enough):\n" +
        "1. Same email address (case-insensitive).\n" +
        "2. Same phone number after stripping spaces, dashes, country codes.\n" +
        "3. Same company despite rebranding, abbreviation, or spacing differences.\n" +
        "4. Same website domain (ignore www, http/https).\n\n" +
        "If matched: respond ONLY { \"matched\": true, \"supplierId\": <number> }\n" +
        "If not matched: respond ONLY { \"matched\": false, \"supplierId\": null }\n" +
        "No explanation. No markdown. JSON only.\n\n" +
        $"INCOMING:\n{incomingJson}\n\n" +
        $"CANDIDATES:\n{candidatesJson}";
    var payload = new
    {
        model = _config["Groq:Model"] ?? "llama-3.1-8b-instant",
        stream = false,
        max_tokens = 50,
        temperature = 0,
        messages = new[]
        {
            new { role = "system", content = "You are a supplier deduplication engine. Return ONLY valid JSON. No markdown. No explanation." },
            new { role = "user", content = prompt }
        }
    };

    // Disposing the request also disposes its content.
    using var request = new HttpRequestMessage(HttpMethod.Post, _config["Groq:ApiUrl"]);
    request.Headers.Add("Authorization", $"Bearer {_config["Groq:ApiKey"]}");
    request.Content = new StringContent(
        JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json");

    using var response = await _httpClient.SendAsync(request);
    response.EnsureSuccessStatusCode();
    var body = await response.Content.ReadAsStringAsync();

    var jsonOptions = new JsonSerializerOptions { PropertyNameCaseInsensitive = true };
    var groqResp = JsonSerializer.Deserialize<GroqResponse>(body, jsonOptions);

    // FirstOrDefault instead of [0]: an empty "choices" list must not throw.
    var rawText = groqResp?.Choices?.FirstOrDefault()?.Message?.Content ?? string.Empty;

    // Strip markdown code fences the model may add despite the instructions.
    rawText = Regex.Replace(rawText, @"```[a-z]*|```", "").Trim();
    if (rawText.Length == 0) return null;

    try
    {
        var match = JsonSerializer.Deserialize<GroqMatchResult>(rawText, jsonOptions);
        if (match?.Matched != true) return null;

        // Only accept an id that was actually offered to the model; anything else is
        // treated as "no match" rather than linking to an unrelated supplier.
        int? matchedId = match.SupplierId;
        return matchedId.HasValue && candidates.Any(c => c.SupplierId == matchedId)
            ? matchedId
            : null;
    }
    catch (JsonException)
    {
        // The model answered with something other than the requested JSON object.
        return null;
    }
}
/// <summary>
/// Creates the service, storing the HTTP client and configuration it uses for its outbound calls.
/// </summary>
/// <param name="httpClient">Client kept for all later requests.</param>
/// <param name="config">Configuration kept for later key lookups.</param>
public SupplierSearchService(HttpClient httpClient, IConfiguration config)
    => (_httpClient, _config) = (httpClient, config);
/// <summary>
/// Runs the supplier discovery pipeline for one item: a Tavily web search, a best-effort
/// fetch of contact pages from the discovered sites, and a Groq extraction pass over the
/// combined text.
/// </summary>
/// <param name="itemName">Name of the item to source.</param>
/// <param name="itemDescription">Free-text description of the item.</param>
/// <param name="isInternational">
/// <c>true</c> to search across Asia (Philippines included); <c>false</c> to search the Philippines only.
/// </param>
/// <returns>The suppliers produced by the Groq extraction step.</returns>
public async Task<List<SupplierResponse>> SearchAndFilterSuppliersAsync(
    string itemName, string itemDescription, bool isInternational)
{
    string region;
    if (isInternational)
    {
        region = "all over Asia including Philippines";
    }
    else
    {
        region = "Philippines";
    }

    // 1) Web search: snippets plus the base URLs of the sites that were found.
    var (snippets, siteUrls) = await SearchTavilyAsync(itemName, itemDescription, region);

    // 2) Contact details scraped from those sites.
    var contactDetails = await FetchContactPagesAsync(siteUrls);

    // 3) Both sources go to Groq as one text; the marker separates the two sections.
    var groqInput = string.Concat(snippets, " CONTACT_PAGES_DATA: ", contactDetails);
    return await FilterWithGroqAsync(itemName, itemDescription, groqInput, isInternational);
}
// ── Tavily ──
/// <summary>
/// Queries the Tavily search endpoint for suppliers of the item and condenses the results
/// into a single text blob plus the list of distinct site base URLs.
/// </summary>
/// <param name="itemName">Name of the item to source.</param>
/// <param name="itemDescription">Free-text description of the item.</param>
/// <param name="locality">Region phrase inserted into the search query.</param>
/// <returns>
/// <c>content</c>: numbered "Title|URL|Content" entries (each snippet reduced to printable ASCII
/// and cut to 300 characters), the whole text cut to 2000 characters.
/// <c>urls</c>: distinct <c>scheme://host</c> values of the http/https results, in result order.
/// </returns>
/// <exception cref="HttpRequestException">The request fails or returns a non-success status code.</exception>
/// <exception cref="JsonException">The response body is not valid JSON.</exception>
private async Task<(string content, List<string> urls)> SearchTavilyAsync(
    string itemName, string itemDescription, string locality)
{
    var query = $"{itemName} {itemDescription} suppliers {locality} budget price contact email phone";
    var payload = new
    {
        query,
        max_results = 10,
        search_depth = "advanced",
        include_answer = false
    };

    // Disposing the request also disposes its content.
    using var request = new HttpRequestMessage(HttpMethod.Post, _config["Tavily:SearchUrl"]);
    request.Headers.Add("Authorization", $"Bearer {_config["Tavily:ApiKey"]}");
    request.Content = new StringContent(
        JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json");

    using var response = await _httpClient.SendAsync(request);
    response.EnsureSuccessStatusCode();
    var body = await response.Content.ReadAsStringAsync();
    var result = JsonSerializer.Deserialize<TavilySearchResult>(body,
        new JsonSerializerOptions { PropertyNameCaseInsensitive = true });

    var sb = new StringBuilder();
    var urls = new List<string>();
    var index = 1;
    foreach (var r in result?.Results ?? new())
    {
        // Replace anything outside printable ASCII, then collapse long whitespace runs.
        var clean = Regex.Replace(r.Content ?? "", @"[^\x20-\x7E]", " ");
        clean = Regex.Replace(clean, @"\s{3,}", " ");
        if (clean.Length > 300) clean = clean[..300];
        sb.Append($"{index}. Title:{r.Title}|URL:{r.Url}|Content:{clean}|");

        // Collect the site root for contact-page fetching. TryCreate avoids using exceptions
        // for malformed URLs, and the scheme check keeps out values such as file:// or
        // mailto: that can never be fetched as a web page.
        if (Uri.TryCreate(r.Url, UriKind.Absolute, out var uri)
            && (uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps))
        {
            var baseUrl = $"{uri.Scheme}://{uri.Host}";
            if (!urls.Contains(baseUrl)) urls.Add(baseUrl);
        }

        index++;
    }

    var fullText = sb.ToString();
    if (fullText.Length > 2000) fullText = fullText[..2000];
    return (fullText, urls);
}
// ── Fetch Contact Pages ───
/// <summary>
/// Best-effort scrape of contact details: for each of the first five base URLs, tries the
/// <c>ContactPaths</c> suffixes in order and keeps the contact-looking tokens from the first
/// page that yields any.
/// </summary>
/// <param name="baseUrls">Site roots (<c>scheme://host</c>) to probe; null is treated as empty.</param>
/// <returns>
/// Entries of the form <c>[baseUrl]: tokens | </c> concatenated and cut to 2000 characters;
/// an empty string when nothing could be fetched.
/// </returns>
private async Task<string> FetchContactPagesAsync(List<string> baseUrls)
{
    if (baseUrls is null) return string.Empty;

    var sb = new StringBuilder();

    // Only the first 5 domains are probed to bound the total time spent.
    foreach (var baseUrl in baseUrls.Take(5))
    {
        foreach (var path in ContactPaths)
        {
            try
            {
                var url = baseUrl + path;

                // Each page gets its own 5-second budget.
                using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(5));
                using var resp = await _httpClient.GetAsync(url, cts.Token);
                if (!resp.IsSuccessStatusCode) continue;

                var html = await resp.Content.ReadAsStringAsync();

                // Strip HTML tags, non-printable/non-ASCII characters and long whitespace runs.
                var text = Regex.Replace(html, @"<[^>]+>", " ");
                text = Regex.Replace(text, @"[^\x20-\x7E]", " ");
                text = Regex.Replace(text, @"\s{3,}", " ");

                // Keep only the space-separated tokens that carry a contact signal.
                // NOTE(review): because filtering is per token, a phone number written with
                // internal spaces is split into fragments that may each be too short to match.
                var contactTokens = text.Split(' ', StringSplitOptions.RemoveEmptyEntries)
                    .Where(LooksLikeContactToken)
                    .Take(50);
                var contactText = string.Join(" ", contactTokens);

                if (!string.IsNullOrWhiteSpace(contactText))
                {
                    sb.Append($"[{baseUrl}]: {contactText} | ");
                    break; // Found a contact page for this domain, move to the next one.
                }
            }
            catch (Exception)
            {
                // Deliberately best-effort: a timeout, an unreachable host or an unreadable
                // page just means this path is skipped.
            }
        }
    }

    return sb.Length > 2000 ? sb.ToString()[..2000] : sb.ToString();

    // True for tokens containing '@', a 7+ character run of digits/dashes/parentheses, or a
    // contact keyword. Keywords are matched case-insensitively so labels such as "Email:",
    // "Phone" or "Tel" are not missed.
    static bool LooksLikeContactToken(string token) =>
        token.Contains("@")
        || Regex.IsMatch(token, @"\+?[\d\-\(\)]{7,}")
        || token.Contains("email", StringComparison.OrdinalIgnoreCase)
        || token.Contains("phone", StringComparison.OrdinalIgnoreCase)
        || token.Contains("contact", StringComparison.OrdinalIgnoreCase)
        || token.Contains("mobile", StringComparison.OrdinalIgnoreCase)
        || token.Contains("tel", StringComparison.OrdinalIgnoreCase);
}
// ── Groq ─────────────────────────────────────────────────────────────────
/// <summary>
/// Sends the collected search and contact text to the Groq chat-completion endpoint, asks for
/// a JSON array of suppliers, and maps the de-duplicated result to <c>SupplierResponse</c> objects.
/// </summary>
/// <param name="itemName">Name of the item to source.</param>
/// <param name="itemDescription">Free-text description of the item.</param>
/// <param name="searchContent">Text passed to the model as the DATA section of the prompt.</param>
/// <param name="isInternational">Selects which locality rule is written into the prompt.</param>
/// <returns>
/// Up to 10 suppliers, unique by company name, email and normalized phone number. Entries
/// without a company name or email are skipped. The list is empty when the model's reply
/// contains no JSON array or the array cannot be parsed.
/// </returns>
/// <exception cref="HttpRequestException">The Groq request fails or returns a non-success status code.</exception>
/// <exception cref="JsonException">The Groq response envelope itself is not valid JSON.</exception>
private async Task<List<SupplierResponse>> FilterWithGroqAsync(
    string itemName, string itemDescription, string searchContent, bool isInternational)
{
    var localityRule = isInternational
        ? "1. Include suppliers from Philippines first, then other Asian countries (e.g. China, Japan, South Korea, Taiwan, India, Singapore).\n"
        : "1. STRICT: Include ONLY suppliers based in the Philippines. Exclude ANY supplier from other countries — even if they ship to Philippines. If a supplier's country is not Philippines, skip it entirely.\n";
    var prompt =
        $"TASK: Extract up to 10 unique suppliers that sell: [{itemName}] — {itemDescription}.\n\n" +
        "RULES:\n" +
        localityRule +
        "2. Prefer budget-friendly suppliers with known pricing.\n" +
        "3. DEDUPLICATION (strict): Each entry must have a unique company_name, contact_email, AND phone_number.\n" +
        "   - If two entries share the same email OR phone number, keep only the first.\n" +
        "   - If two inferred emails resolve to the same address, keep only one.\n" +
        "4. CONTACT EXTRACTION:\n" +
        "   - Look in the CONTACT_PAGES_DATA section for real emails and phone numbers.\n" +
        "   - Use exact values found. Do not fabricate contact details.\n" +
        "   - If no email is found for a domain, infer: sales@domain.com or info@domain.com.\n" +
        "   - If no phone is found, use null — do not guess.\n" +
        "5. estimated_price_usd MUST be a number (e.g. 12.50) or null. NEVER a string.\n" +
        "6. Exclude any supplier with no company_name or no contact_email.\n\n" +
        "OUTPUT FORMAT:\n" +
        "Return ONLY a valid raw JSON array. No markdown. No explanation. No extra text.\n" +
        "Each object must have exactly these fields:\n" +
        "  company_name        (string)\n" +
        "  country             (string)\n" +
        "  phone_number        (string | null)\n" +
        "  contact_email       (string | null)\n" +
        "  website             (string | null)\n" +
        "  estimated_price_usd (number | null)\n" +
        "  item_specifications (string | null)\n\n" +
        $"DATA:\n{searchContent}";
    var payload = new
    {
        model = _config["Groq:Model"] ?? "llama-3.1-8b-instant",
        stream = false,
        max_tokens = 2048,
        temperature = 0.1,
        messages = new[]
        {
            new { role = "system", content = "You are a supplier data extractor. Extract real contact details from provided content. Return ONLY a valid JSON array, no markdown, no explanation." },
            new { role = "user", content = prompt }
        }
    };

    // Disposing the request also disposes its content.
    using var request = new HttpRequestMessage(HttpMethod.Post, _config["Groq:ApiUrl"]);
    request.Headers.Add("Authorization", $"Bearer {_config["Groq:ApiKey"]}");
    request.Content = new StringContent(
        JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json");

    using var response = await _httpClient.SendAsync(request);
    response.EnsureSuccessStatusCode();
    var body = await response.Content.ReadAsStringAsync();
    var groqResp = JsonSerializer.Deserialize<GroqResponse>(body,
        new JsonSerializerOptions { PropertyNameCaseInsensitive = true });

    // FirstOrDefault instead of [0]: an empty "choices" list must not throw.
    var rawText = groqResp?.Choices?.FirstOrDefault()?.Message?.Content ?? string.Empty;

    // Take everything from the first '[' to the last ']' so surrounding chatter is ignored.
    var match = Regex.Match(rawText, @"\[[\s\S]*\]");
    if (!match.Success) return new List<SupplierResponse>();

    var jsonOptions = new JsonSerializerOptions
    {
        PropertyNameCaseInsensitive = true,
        Converters = { new FlexibleDecimalConverter() }
    };

    List<GroqSupplierResult> groqList;
    try
    {
        groqList = JsonSerializer.Deserialize<List<GroqSupplierResult>>(match.Value, jsonOptions)
            ?? new List<GroqSupplierResult>();
    }
    catch (JsonException)
    {
        // An unparseable array is handled like "no array found" above.
        return new List<SupplierResponse>();
    }

    // NOTE(review): locality is enforced only through the prompt. The previous revision built
    // an allowed-country set here but never consulted it; confirm whether a code-side country
    // filter is wanted before adding one.
    var seenNames = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var seenEmails = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var seenPhones = new HashSet<string>(StringComparer.Ordinal);
    var suppliers = new List<SupplierResponse>();

    foreach (var s in groqList)
    {
        // A literal null element in the model's array is skipped.
        if (s is null) continue;

        var name = (s.CompanyName ?? string.Empty).Trim();
        var email = (s.ContactEmail ?? string.Empty).Trim();
        var phone = NormalizePhone(s.PhoneNumber ?? string.Empty);

        // Company name and email are mandatory.
        if (name.Length == 0 || email.Length == 0) continue;

        // Skip if company name, email, or (non-empty) phone was already seen.
        if (seenNames.Contains(name) || seenEmails.Contains(email)) continue;
        if (phone.Length > 0 && seenPhones.Contains(phone)) continue;

        seenNames.Add(name);
        seenEmails.Add(email);
        if (phone.Length > 0) seenPhones.Add(phone);

        // The model's estimated_price_usd and item_specifications are not mapped here; the
        // commercial fields below are fixed defaults.
        suppliers.Add(new SupplierResponse
        {
            SupplierName = s.CompanyName,
            EmailAddress = s.ContactEmail,
            ContactNo = s.PhoneNumber ?? string.Empty,
            Address = s.Country ?? string.Empty,
            IsActive = true,
            VatInc = false,
            Currency = "PHP",
            CurrencyId = 1,
            PaymentTermsId = 1,
            PaymentTerms = "30 Days",
            LeadTime = "7-14 Days",
            TinNo = string.Empty,
            ContactPerson = string.Empty,
            Website = s.Website ?? string.Empty,
        });

        if (suppliers.Count >= 10) break;
    }

    return suppliers;
}
/// <summary>
/// Reduces a phone number to its digits so differently formatted numbers can be compared.
/// </summary>
/// <param name="phone">Raw phone text; may be null, empty or whitespace.</param>
/// <returns>
/// The digits of <paramref name="phone"/>; an 11-digit result that starts with "1" loses that
/// leading "1". Empty string for null/blank input.
/// </returns>
/// <remarks>
/// NOTE(review): the 11-digit leading-"1" case is the only country-code handling here; other
/// prefixes (for example "63" versus a leading "0") are left as they are.
/// </remarks>
private static string NormalizePhone(string phone)
{
    if (string.IsNullOrWhiteSpace(phone))
    {
        return string.Empty;
    }

    // Drop every non-digit character.
    var digitsOnly = Regex.Replace(phone, @"\D", "");

    // 11 digits with a leading "1" is treated as a US/CA number carrying its country code.
    var hasLeadingOne = digitsOnly.Length == 11 && digitsOnly.StartsWith("1");
    return hasLeadingOne ? digitsOnly.Substring(1) : digitsOnly;
}
/// <summary>
/// Extracts the host name from a website value so that two spellings of the same site compare equal.
/// </summary>
/// <param name="url">
/// A website as entered: with or without an http/https scheme (any casing), with or without
/// a "www." prefix, path or port. May be null, empty or whitespace.
/// </param>
/// <returns>
/// The host without a leading "www.", or an empty string when the input is blank or cannot
/// be parsed as a URL.
/// </returns>
private static string ExtractDomain(string url)
{
    if (string.IsNullOrWhiteSpace(url)) return string.Empty;

    var candidate = url.Trim();

    // Test for a real scheme prefix, case-insensitively. A bare StartsWith("http") treated
    // "HTTPS://..." as scheme-less and hosts such as "httpbin.org" as already having a scheme.
    var hasHttpScheme =
        candidate.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || candidate.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasHttpScheme) candidate = "https://" + candidate;

    // TryCreate reports malformed input without throwing.
    if (!Uri.TryCreate(candidate, UriKind.Absolute, out var uri) || string.IsNullOrEmpty(uri.Host))
    {
        return string.Empty;
    }

    var host = uri.Host;
    return host.StartsWith("www.", StringComparison.OrdinalIgnoreCase) ? host[4..] : host;
}
}
}