using CPRNIMS.Infrastructure.Dto.Canvass.Response;
using CPRNIMS.Infrastructure.Dto.Canvass.Result;
using Microsoft.Extensions.Configuration;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;

namespace CPRNIMS.Domain.Services.Canvass
{
    /// <summary>
    /// Finds candidate suppliers for an item by combining a Tavily web search,
    /// a best-effort scrape of supplier contact pages and a Groq chat completion
    /// that extracts structured supplier records. Also offers a duplicate check
    /// of an incoming supplier against already known ones.
    /// </summary>
    /// <remarks>
    /// All HTTP traffic goes through the injected <see cref="HttpClient"/>.
    /// Endpoints, keys and the model name are read from configuration keys
    /// <c>Groq:ApiUrl</c>, <c>Groq:ApiKey</c>, <c>Groq:Model</c>,
    /// <c>Tavily:SearchUrl</c> and <c>Tavily:ApiKey</c>.
    /// API responses are read with <see cref="JsonDocument"/>, so parsing only
    /// depends on the JSON property names used below.
    /// </remarks>
    public class SupplierSearchService
    {
        private const string DefaultGroqModel = "llama-3.1-8b-instant";
        private const int MaxSuppliers = 10;
        private const int MaxMatchCandidates = 5;
        private const int MaxContactDomains = 5;
        private const int MaxContactWords = 50;
        private const int MaxSnippetLength = 300;
        private const int MaxSectionLength = 2000;

        private static readonly TimeSpan ContactPageTimeout = TimeSpan.FromSeconds(5);

        // Common contact page suffixes to try
        private static readonly string[] ContactPaths =
        {
            "/contact",
            "/contact-us",
            "/pages/contact-us",
            "/about/contact",
            "/about"
        };

        // Substrings that mark a scraped word as contact-related.
        private static readonly string[] ContactKeywords =
        {
            "email", "phone", "contact", "mobile", "tel"
        };

        private static readonly Regex NonPrintableAsciiRegex = new Regex(@"[^\x20-\x7E]", RegexOptions.Compiled);
        private static readonly Regex LongWhitespaceRegex = new Regex(@"\s{3,}", RegexOptions.Compiled);
        private static readonly Regex HtmlTagRegex = new Regex(@"<[^>]+>", RegexOptions.Compiled);
        private static readonly Regex PhoneLikeRegex = new Regex(@"\+?[\d\-\(\)]{7,}", RegexOptions.Compiled);
        private static readonly Regex CodeFenceRegex = new Regex(@"```[a-z]*|```", RegexOptions.Compiled);
        private static readonly Regex JsonArrayRegex = new Regex(@"\[[\s\S]*\]", RegexOptions.Compiled);
        private static readonly Regex NonDigitRegex = new Regex(@"\D", RegexOptions.Compiled);

        private readonly HttpClient _httpClient;
        private readonly IConfiguration _config;

        /// <summary>Creates the service.</summary>
        /// <param name="httpClient">Client used for every outgoing request.</param>
        /// <param name="config">Configuration holding the Groq and Tavily settings.</param>
        public SupplierSearchService(HttpClient httpClient, IConfiguration config)
        {
            _httpClient = httpClient;
            _config = config;
        }

        /// <summary>
        /// Checks whether <paramref name="incoming"/> duplicates one of
        /// <paramref name="existingSuppliers"/>.
        /// </summary>
        /// <remarks>
        /// Three layers, cheapest first:
        /// 1. exact match on e-mail, normalized phone number or website host;
        /// 2. a pre-filter keeping at most five suppliers whose first name word
        ///    starts with the incoming supplier's first name word;
        /// 3. a Groq call that judges only that short candidate list.
        /// An unparseable model answer, or an id that is not one of the
        /// candidates that were sent, is treated as "no match".
        /// NOTE(review): the id type (<c>int</c>) and the element type of
        /// <paramref name="existingSuppliers"/> were inferred from usage —
        /// confirm against the callers.
        /// </remarks>
        /// <param name="incoming">Supplier about to be added.</param>
        /// <param name="existingSuppliers">Suppliers already stored; null is treated as empty.</param>
        /// <param name="cancellationToken">Cancels the Groq request.</param>
        /// <returns>The matched existing SupplierId, or null if no match.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="incoming"/> is null.</exception>
        /// <exception cref="HttpRequestException">The Groq endpoint returned a non-success status.</exception>
        public async Task<int?> FindMatchingExistingSupplierAsync(
            SupplierResponse incoming,
            List<SupplierResponse> existingSuppliers,
            CancellationToken cancellationToken = default)
        {
            if (incoming is null)
            {
                throw new ArgumentNullException(nameof(incoming));
            }

            if (existingSuppliers is null || existingSuppliers.Count == 0)
            {
                return null;
            }

            // ── Layer 1: exact match (fast, free, no API call) ──────────────
            var incomingEmail = (incoming.EmailAddress ?? string.Empty).Trim();
            var incomingPhone = NormalizePhone(incoming.ContactNo ?? string.Empty);
            var incomingDomain = ExtractDomain(incoming.Website ?? string.Empty);

            foreach (var existing in existingSuppliers)
            {
                if (existing is null)
                {
                    continue;
                }

                var existingEmail = (existing.EmailAddress ?? string.Empty).Trim();
                if (incomingEmail.Length > 0
                    && string.Equals(incomingEmail, existingEmail, StringComparison.OrdinalIgnoreCase))
                {
                    return existing.SupplierId;
                }

                if (incomingPhone.Length > 0
                    && incomingPhone == NormalizePhone(existing.ContactNo ?? string.Empty))
                {
                    return existing.SupplierId;
                }

                if (incomingDomain.Length > 0
                    && string.Equals(incomingDomain, ExtractDomain(existing.Website ?? string.Empty), StringComparison.OrdinalIgnoreCase))
                {
                    return existing.SupplierId;
                }
            }

            // ── Layer 2: pre-filter — narrow to a handful of candidates ─────
            // Keep suppliers whose first word matches (e.g. "Linde" in
            // "Linde PH" vs "Linde Philippines"); ignore short words like "co", "ph".
            var incomingFirstWord = FirstWord(incoming.SupplierName);
            if (incomingFirstWord.Length <= 2)
            {
                return null;
            }

            var candidates = existingSuppliers
                .Where(s => s != null
                    && FirstWord(s.SupplierName).StartsWith(incomingFirstWord, StringComparison.OrdinalIgnoreCase))
                .Take(MaxMatchCandidates) // small list keeps the prompt well within the token limit
                .Select(s => new
                {
                    s.SupplierId,
                    s.SupplierName,
                    s.EmailAddress,
                    s.ContactNo,
                    s.Website
                })
                .ToList();

            // No candidates found — it's a new supplier
            if (candidates.Count == 0)
            {
                return null;
            }

            // ── Layer 3: Groq fuzzy match — only on the small candidate list ─
            var incomingJson = JsonSerializer.Serialize(new
            {
                incoming.SupplierName,
                incoming.EmailAddress,
                incoming.ContactNo,
                incoming.Website
            });
            var candidatesJson = JsonSerializer.Serialize(candidates);

            var prompt =
                "TASK: Determine if the INCOMING supplier already exists in the CANDIDATES list.\n\n" +
                "MATCHING RULES (any one is enough):\n" +
                "1. Same email address (case-insensitive).\n" +
                "2. Same phone number after stripping spaces, dashes, country codes.\n" +
                "3. Same company despite rebranding, abbreviation, or spacing differences.\n" +
                "4. Same website domain (ignore www, http/https).\n\n" +
                "If matched: respond ONLY { \"matched\": true, \"supplierId\": <SupplierId of the matching candidate> }\n" +
                "If not matched: respond ONLY { \"matched\": false, \"supplierId\": null }\n" +
                "No explanation. No markdown. JSON only.\n\n" +
                $"INCOMING:\n{incomingJson}\n\n" +
                $"CANDIDATES:\n{candidatesJson}";

            var rawText = await SendGroqChatAsync(
                "You are a supplier deduplication engine. Return ONLY valid JSON. No markdown. No explanation.",
                prompt,
                50,
                0,
                cancellationToken);

            var matchedId = ParseMatchedSupplierId(rawText);
            if (matchedId is null)
            {
                return null;
            }

            // Only trust ids that were actually offered to the model.
            return candidates.Any(c => c.SupplierId == matchedId.Value) ? matchedId : null;
        }

        /// <summary>
        /// Searches the web for suppliers of an item and returns up to ten
        /// de-duplicated supplier records extracted by Groq.
        /// </summary>
        /// <param name="itemName">Name of the item to source.</param>
        /// <param name="itemDescription">Free-text description added to the search and the prompt.</param>
        /// <param name="isInternational">
        /// When true the search covers Asia including the Philippines; otherwise
        /// the Philippines only.
        /// </param>
        /// <param name="cancellationToken">Cancels the outgoing requests.</param>
        /// <returns>The extracted suppliers; empty when the model returned nothing usable.</returns>
        /// <exception cref="HttpRequestException">Tavily or Groq returned a non-success status.</exception>
        /// <exception cref="JsonException">The Tavily or Groq HTTP response body is not valid JSON.</exception>
        public async Task<List<SupplierResponse>> SearchAndFilterSuppliersAsync(
            string itemName,
            string itemDescription,
            bool isInternational,
            CancellationToken cancellationToken = default)
        {
            var locality = isInternational
                ? "all over Asia including Philippines"
                : "Philippines";

            // Step 1: Tavily — get search snippets and supplier URLs
            var (searchContent, supplierUrls) =
                await SearchTavilyAsync(itemName, itemDescription, locality, cancellationToken);

            // Step 2: fetch contact pages from the discovered URLs
            var contactContent = await FetchContactPagesAsync(supplierUrls, cancellationToken);

            // Step 3: combine search + contact content and send it to Groq
            var combined = searchContent + " CONTACT_PAGES_DATA: " + contactContent;
            return await FilterWithGroqAsync(itemName, itemDescription, combined, isInternational, cancellationToken);
        }

        // ── Tavily ──────────────────────────────────────────────────────────

        /// <summary>
        /// Runs a Tavily search and returns the concatenated result snippets
        /// (capped at 2000 characters) together with the distinct
        /// scheme-and-host URLs of the results.
        /// </summary>
        private async Task<(string content, List<string> urls)> SearchTavilyAsync(
            string itemName,
            string itemDescription,
            string locality,
            CancellationToken cancellationToken)
        {
            var query = $"{itemName} {itemDescription} suppliers {locality} budget price contact email phone";
            var payload = new
            {
                query,
                max_results = 10,
                search_depth = "advanced",
                include_answer = false
            };

            using var request = new HttpRequestMessage(HttpMethod.Post, _config["Tavily:SearchUrl"]);
            request.Headers.Add("Authorization", $"Bearer {_config["Tavily:ApiKey"]}");
            request.Content = new StringContent(
                JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json");

            using var response = await _httpClient.SendAsync(request, cancellationToken);
            response.EnsureSuccessStatusCode();
            var body = await response.Content.ReadAsStringAsync();

            var sb = new StringBuilder();
            var urls = new List<string>();

            using var doc = JsonDocument.Parse(body);
            if (TryGetPropertyIgnoreCase(doc.RootElement, "results", out var results)
                && results.ValueKind == JsonValueKind.Array)
            {
                var index = 1;
                foreach (var result in results.EnumerateArray())
                {
                    var title = ReadString(result, "title");
                    var url = ReadString(result, "url");

                    // Clean text: printable ASCII only, collapse long whitespace runs, cap length
                    var clean = NonPrintableAsciiRegex.Replace(ReadString(result, "content"), " ");
                    clean = LongWhitespaceRegex.Replace(clean, " ");
                    if (clean.Length > MaxSnippetLength)
                    {
                        clean = clean[..MaxSnippetLength];
                    }

                    sb.Append($"{index}. Title:{title}|URL:{url}|Content:{clean}|");

                    // Collect base URLs for contact page fetching; skip anything
                    // that is not an absolute http(s) URL.
                    if (Uri.TryCreate(url, UriKind.Absolute, out var uri)
                        && (uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps))
                    {
                        var baseUrl = $"{uri.Scheme}://{uri.Host}";
                        if (!urls.Contains(baseUrl))
                        {
                            urls.Add(baseUrl);
                        }
                    }

                    index++;
                }
            }

            var fullText = sb.ToString();
            if (fullText.Length > MaxSectionLength)
            {
                fullText = fullText[..MaxSectionLength];
            }

            return (fullText, urls);
        }

        // ── Fetch Contact Pages ─────────────────────────────────────────────

        /// <summary>
        /// Best-effort scrape: for the first five base URLs, tries each path in
        /// <see cref="ContactPaths"/> until one yields contact-looking words,
        /// and returns them as one string capped at 2000 characters.
        /// </summary>
        /// <remarks>
        /// Each request is limited to five seconds. Unreachable or failing pages
        /// are skipped; only cancellation requested by the caller propagates.
        /// </remarks>
        private async Task<string> FetchContactPagesAsync(
            List<string> baseUrls,
            CancellationToken cancellationToken)
        {
            var sb = new StringBuilder();

            // Limit to the top domains to avoid long waits
            foreach (var baseUrl in baseUrls.Take(MaxContactDomains))
            {
                foreach (var path in ContactPaths)
                {
                    try
                    {
                        using var timeout = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
                        timeout.CancelAfter(ContactPageTimeout);

                        using var resp = await _httpClient.GetAsync(baseUrl + path, timeout.Token);
                        if (!resp.IsSuccessStatusCode)
                        {
                            continue;
                        }

                        var html = await resp.Content.ReadAsStringAsync();

                        // Strip HTML tags and non-printable characters
                        var text = HtmlTagRegex.Replace(html, " ");
                        text = NonPrintableAsciiRegex.Replace(text, " ");
                        text = LongWhitespaceRegex.Replace(text, " ");

                        // Keep only words that carry a contact signal
                        var contactWords = text
                            .Split(' ', StringSplitOptions.RemoveEmptyEntries)
                            .Where(IsContactSignal)
                            .Take(MaxContactWords);

                        var contactText = string.Join(" ", contactWords);
                        if (!string.IsNullOrWhiteSpace(contactText))
                        {
                            sb.Append($"[{baseUrl}]: {contactText} | ");
                            break; // Found a contact page for this domain, move to the next
                        }
                    }
                    catch (Exception) when (!cancellationToken.IsCancellationRequested)
                    {
                        // Timeout or unreachable page — skip it; this step is best-effort.
                    }
                }
            }

            var combined = sb.ToString();
            return combined.Length > MaxSectionLength ? combined[..MaxSectionLength] : combined;
        }

        /// <summary>
        /// True when a scraped word contains '@', looks like a phone number, or
        /// contains one of <see cref="ContactKeywords"/> (case-insensitive).
        /// </summary>
        private static bool IsContactSignal(string word)
        {
            if (word.IndexOf('@') >= 0 || PhoneLikeRegex.IsMatch(word))
            {
                return true;
            }

            return ContactKeywords.Any(k => word.IndexOf(k, StringComparison.OrdinalIgnoreCase) >= 0);
        }

        // ── Groq ────────────────────────────────────────────────────────────

        /// <summary>
        /// Asks Groq to extract suppliers from <paramref name="searchContent"/>
        /// and converts the answer into at most ten records, dropping entries
        /// without a name or e-mail and entries repeating a name, e-mail or phone.
        /// </summary>
        /// <remarks>
        /// Country restrictions are expressed in the prompt only; the returned
        /// country is not re-checked here.
        /// </remarks>
        private async Task<List<SupplierResponse>> FilterWithGroqAsync(
            string itemName,
            string itemDescription,
            string searchContent,
            bool isInternational,
            CancellationToken cancellationToken = default)
        {
            var localityRule = isInternational
                ? "1. Include suppliers from Philippines first, then other Asian countries (e.g. China, Japan, South Korea, Taiwan, India, Singapore).\n"
                : "1. STRICT: Include ONLY suppliers based in the Philippines. Exclude ANY supplier from other countries — even if they ship to Philippines. If a supplier's country is not Philippines, skip it entirely.\n";

            var prompt =
                $"TASK: Extract up to 10 unique suppliers that sell: [{itemName}] — {itemDescription}.\n\n" +
                "RULES:\n" +
                localityRule +
                "2. Prefer budget-friendly suppliers with known pricing.\n" +
                "3. DEDUPLICATION (strict): Each entry must have a unique company_name, contact_email, AND phone_number.\n" +
                " - If two entries share the same email OR phone number, keep only the first.\n" +
                " - If two inferred emails resolve to the same address, keep only one.\n" +
                "4. CONTACT EXTRACTION:\n" +
                " - Look in the CONTACT_PAGES_DATA section for real emails and phone numbers.\n" +
                " - Use exact values found. Do not fabricate contact details.\n" +
                " - If no email is found for a domain, infer: sales@domain.com or info@domain.com.\n" +
                " - If no phone is found, use null — do not guess.\n" +
                "5. estimated_price_usd MUST be a number (e.g. 12.50) or null. NEVER a string.\n" +
                "6. Exclude any supplier with no company_name or no contact_email.\n\n" +
                "OUTPUT FORMAT:\n" +
                "Return ONLY a valid raw JSON array. No markdown. No explanation. No extra text.\n" +
                "Each object must have exactly these fields:\n" +
                " company_name (string)\n" +
                " country (string)\n" +
                " phone_number (string | null)\n" +
                " contact_email (string | null)\n" +
                " website (string | null)\n" +
                " estimated_price_usd (number | null)\n" +
                " item_specifications (string | null)\n\n" +
                $"DATA:\n{searchContent}";

            var rawText = await SendGroqChatAsync(
                "You are a supplier data extractor. Extract real contact details from provided content. Return ONLY a valid JSON array, no markdown, no explanation.",
                prompt,
                2048,
                0.1,
                cancellationToken);

            // The model sometimes wraps the array in extra text; take the outermost [...] span.
            var arrayMatch = JsonArrayRegex.Match(rawText);
            return arrayMatch.Success
                ? BuildSupplierList(arrayMatch.Value)
                : new List<SupplierResponse>();
        }

        /// <summary>
        /// Posts a system + user message pair to the Groq chat endpoint and
        /// returns the text of the first choice, or an empty string when the
        /// response carries no choice or no string content.
        /// </summary>
        private async Task<string> SendGroqChatAsync(
            string systemPrompt,
            string userPrompt,
            int maxTokens,
            double temperature,
            CancellationToken cancellationToken)
        {
            var payload = new
            {
                model = _config["Groq:Model"] ?? DefaultGroqModel,
                stream = false,
                max_tokens = maxTokens,
                temperature,
                messages = new[]
                {
                    new { role = "system", content = systemPrompt },
                    new { role = "user", content = userPrompt }
                }
            };

            using var request = new HttpRequestMessage(HttpMethod.Post, _config["Groq:ApiUrl"]);
            request.Headers.Add("Authorization", $"Bearer {_config["Groq:ApiKey"]}");
            request.Content = new StringContent(
                JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json");

            using var response = await _httpClient.SendAsync(request, cancellationToken);
            response.EnsureSuccessStatusCode();
            var body = await response.Content.ReadAsStringAsync();

            using var doc = JsonDocument.Parse(body);
            if (TryGetPropertyIgnoreCase(doc.RootElement, "choices", out var choices)
                && choices.ValueKind == JsonValueKind.Array
                && choices.GetArrayLength() > 0
                && TryGetPropertyIgnoreCase(choices[0], "message", out var message)
                && TryGetPropertyIgnoreCase(message, "content", out var content)
                && content.ValueKind == JsonValueKind.String)
            {
                return content.GetString() ?? string.Empty;
            }

            return string.Empty;
        }

        /// <summary>
        /// Reads the model's <c>{ "matched": ..., "supplierId": ... }</c> answer.
        /// Returns the id only when <c>matched</c> is true and the id is an
        /// integer (number or numeric string); otherwise null.
        /// </summary>
        private static int? ParseMatchedSupplierId(string rawText)
        {
            // Drop markdown code fences the model may add despite instructions.
            var cleaned = CodeFenceRegex.Replace(rawText ?? string.Empty, string.Empty).Trim();

            var start = cleaned.IndexOf('{');
            var end = cleaned.LastIndexOf('}');
            if (start < 0 || end <= start)
            {
                return null;
            }

            try
            {
                using var doc = JsonDocument.Parse(cleaned.Substring(start, end - start + 1));
                var root = doc.RootElement;

                if (!TryGetPropertyIgnoreCase(root, "matched", out var matched)
                    || matched.ValueKind != JsonValueKind.True)
                {
                    return null;
                }

                if (!TryGetPropertyIgnoreCase(root, "supplierId", out var idElement))
                {
                    return null;
                }

                if (idElement.ValueKind == JsonValueKind.Number && idElement.TryGetInt32(out var numericId))
                {
                    return numericId;
                }

                if (idElement.ValueKind == JsonValueKind.String
                    && int.TryParse(idElement.GetString(), NumberStyles.Integer, CultureInfo.InvariantCulture, out var parsedId))
                {
                    return parsedId;
                }

                return null;
            }
            catch (JsonException)
            {
                // Model output was not valid JSON — treat as "no match".
                return null;
            }
        }

        /// <summary>
        /// Converts the model's JSON array into supplier records, applying the
        /// name / e-mail / phone de-duplication and the ten-record cap.
        /// Returns an empty list when the text is not a valid JSON array.
        /// </summary>
        private static List<SupplierResponse> BuildSupplierList(string jsonArray)
        {
            var suppliers = new List<SupplierResponse>();

            try
            {
                using var doc = JsonDocument.Parse(jsonArray);
                if (doc.RootElement.ValueKind != JsonValueKind.Array)
                {
                    return suppliers;
                }

                var seenNames = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
                var seenEmails = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
                var seenPhones = new HashSet<string>(StringComparer.Ordinal);

                foreach (var item in doc.RootElement.EnumerateArray())
                {
                    if (item.ValueKind != JsonValueKind.Object)
                    {
                        continue;
                    }

                    var companyName = ReadString(item, "company_name");
                    var contactEmail = ReadString(item, "contact_email");
                    var phoneNumber = ReadString(item, "phone_number");

                    var nameKey = companyName.Trim();
                    var emailKey = contactEmail.Trim();
                    var phoneKey = NormalizePhone(phoneNumber);

                    // Skip entries without a company name or an e-mail
                    if (nameKey.Length == 0 || emailKey.Length == 0)
                    {
                        continue;
                    }

                    // Skip if company name, e-mail, OR phone was already seen
                    if (seenNames.Contains(nameKey) || seenEmails.Contains(emailKey))
                    {
                        continue;
                    }

                    if (phoneKey.Length > 0 && seenPhones.Contains(phoneKey))
                    {
                        continue;
                    }

                    seenNames.Add(nameKey);
                    seenEmails.Add(emailKey);
                    if (phoneKey.Length > 0)
                    {
                        seenPhones.Add(phoneKey);
                    }

                    suppliers.Add(new SupplierResponse
                    {
                        SupplierName = companyName,
                        EmailAddress = contactEmail,
                        ContactNo = phoneNumber,
                        Address = ReadString(item, "country"),
                        IsActive = true,
                        VatInc = false,
                        Currency = "PHP",
                        CurrencyId = 1,
                        PaymentTermsId = 1,
                        PaymentTerms = "30 Days",
                        LeadTime = "7-14 Days",
                        TinNo = string.Empty,
                        ContactPerson = string.Empty,
                        Website = ReadString(item, "website"),
                    });

                    if (suppliers.Count >= MaxSuppliers)
                    {
                        break;
                    }
                }
            }
            catch (JsonException)
            {
                // Model output was not valid JSON — same outcome as "no array found".
                return new List<SupplierResponse>();
            }

            return suppliers;
        }

        // ── Helpers ─────────────────────────────────────────────────────────

        /// <summary>Looks up an object property by name, ignoring case.</summary>
        private static bool TryGetPropertyIgnoreCase(JsonElement element, string name, out JsonElement value)
        {
            if (element.ValueKind == JsonValueKind.Object)
            {
                foreach (var property in element.EnumerateObject())
                {
                    if (string.Equals(property.Name, name, StringComparison.OrdinalIgnoreCase))
                    {
                        value = property.Value;
                        return true;
                    }
                }
            }

            value = default;
            return false;
        }

        /// <summary>
        /// Returns a property as text: string values as-is, numbers as their raw
        /// JSON text, and an empty string for anything missing, null or non-scalar.
        /// </summary>
        private static string ReadString(JsonElement element, string name)
        {
            if (!TryGetPropertyIgnoreCase(element, name, out var value))
            {
                return string.Empty;
            }

            switch (value.ValueKind)
            {
                case JsonValueKind.String:
                    return value.GetString() ?? string.Empty;
                case JsonValueKind.Number:
                    return value.GetRawText();
                default:
                    return string.Empty;
            }
        }

        /// <summary>Returns the first space-separated word of a name, or an empty string.</summary>
        private static string FirstWord(string name)
        {
            return (name ?? string.Empty)
                .Split(' ', StringSplitOptions.RemoveEmptyEntries)
                .FirstOrDefault() ?? string.Empty;
        }

        /// <summary>
        /// Reduces a phone number to its digits. An 11-digit number starting
        /// with "1" loses that leading digit (US/CA country code).
        /// </summary>
        private static string NormalizePhone(string phone)
        {
            if (string.IsNullOrWhiteSpace(phone))
            {
                return string.Empty;
            }

            // Strip everything except digits
            var digits = NonDigitRegex.Replace(phone, string.Empty);

            // Remove leading country code "1" for US/CA numbers (11 digits starting with 1)
            if (digits.Length == 11 && digits.StartsWith("1", StringComparison.Ordinal))
            {
                digits = digits[1..];
            }

            return digits;
        }

        /// <summary>
        /// Returns the host of a URL without a leading "www.", or an empty
        /// string when the value cannot be parsed as an absolute URL. Values
        /// that do not start with "http" get "https://" prepended first.
        /// </summary>
        private static string ExtractDomain(string url)
        {
            if (string.IsNullOrWhiteSpace(url))
            {
                return string.Empty;
            }

            var candidate = url.Trim();
            if (!candidate.StartsWith("http", StringComparison.OrdinalIgnoreCase))
            {
                candidate = "https://" + candidate;
            }

            if (!Uri.TryCreate(candidate, UriKind.Absolute, out var uri))
            {
                return string.Empty;
            }

            var host = uri.Host;
            return host.StartsWith("www.", StringComparison.OrdinalIgnoreCase) ? host[4..] : host;
        }
    }
}