NonInventPurchasingSystem/CPRNIMS.Domain/Services/Canvass/SupplierSearchService.cs

236 lines
10 KiB
C#

using CPRNIMS.Infrastructure.Dto.Canvass.Response;
using CPRNIMS.Infrastructure.Dto.Canvass.Result;
using Microsoft.Extensions.Configuration;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.Json;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace CPRNIMS.Domain.Services.Canvass
{
/// <summary>
/// Finds candidate suppliers for an item in three steps: a Tavily web search, a best-effort
/// scrape of the discovered sites' contact pages, and a Groq chat-completion call that turns
/// the combined text into structured supplier records.
/// </summary>
public class SupplierSearchService
{
    private const int MaxSearchResults = 10;
    private const int MaxSnippetLength = 300;
    // Each text section handed to the model is capped at this many characters.
    private const int MaxSectionLength = 2000;
    private const int MaxContactDomains = 5;
    private const int MaxContactSignals = 50;
    private const int MaxSuppliers = 10;
    private static readonly TimeSpan ContactPageTimeout = TimeSpan.FromSeconds(5);

    // Common contact page suffixes to try, in order, for every discovered site.
    private static readonly string[] ContactPaths =
        { "/contact", "/contact-us", "/pages/contact-us", "/about/contact", "/about" };

    // Shared so the serializer metadata cache is reused instead of rebuilt on every call.
    private static readonly JsonSerializerOptions JsonOptions =
        new JsonSerializerOptions { PropertyNameCaseInsensitive = true };

    // Anything outside printable ASCII.
    private static readonly Regex NonPrintableAscii = new Regex(@"[^\x20-\x7E]", RegexOptions.Compiled);
    // Runs of three or more whitespace characters.
    private static readonly Regex WhitespaceRun = new Regex(@"\s{3,}", RegexOptions.Compiled);
    // Whole script/style elements, whose bodies are code rather than page text.
    private static readonly Regex ScriptOrStyleBlock = new Regex(
        @"<(script|style)\b[^>]*>[\s\S]*?</\1\s*>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
    // Anchor tags whose href is a mailto:/tel: link; group 1 is the address or number.
    private static readonly Regex ContactLink = new Regex(
        @"<a\b[^>]*?\bhref\s*=\s*[""']?\s*(?:mailto|tel):([^""'?\s>]+)[^>]*>",
        RegexOptions.Compiled | RegexOptions.IgnoreCase);
    // Any remaining HTML tag.
    private static readonly Regex HtmlTag = new Regex(@"<[^>]+>", RegexOptions.Compiled);
    // E-mail address; every quantifier is bounded so long runs of text cannot cause heavy backtracking.
    private static readonly Regex EmailAddress = new Regex(
        @"[A-Za-z0-9._%+\-]{1,64}@[A-Za-z0-9\-]{1,63}(?:\.[A-Za-z0-9\-]{1,63})*\.[A-Za-z]{2,24}",
        RegexOptions.Compiled);
    // Phone-like sequence: 7 to 15 digits, consecutive digits separated by at most one of
    // - . ) then at most one space then at most one "(". Not preceded or followed by a word character.
    private static readonly Regex PhoneNumber = new Regex(
        @"(?<!\w)\+?\(?\d(?:[\-\.\)]? ?\(?\d){6,14}(?![\w@])", RegexOptions.Compiled);
    // Outermost [...] span in the model's reply.
    private static readonly Regex JsonArray = new Regex(@"\[[\s\S]*\]", RegexOptions.Compiled);

    private readonly HttpClient _httpClient;
    private readonly IConfiguration _config;

    /// <summary>Creates the service.</summary>
    /// <param name="httpClient">Client used for the Tavily, Groq and contact-page requests.</param>
    /// <param name="config">
    /// Configuration read for <c>Tavily:SearchUrl</c>, <c>Tavily:ApiKey</c>, <c>Groq:ApiUrl</c>,
    /// <c>Groq:ApiKey</c> and the optional <c>Groq:Model</c>.
    /// </param>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public SupplierSearchService(HttpClient httpClient, IConfiguration config)
    {
        _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));
        _config = config ?? throw new ArgumentNullException(nameof(config));
    }

    /// <summary>
    /// Searches the web for suppliers of the given item and returns the ones the model could
    /// extract together with a contact e-mail address.
    /// </summary>
    /// <param name="itemName">Item name; used in the search query and the extraction prompt.</param>
    /// <param name="itemDescription">Item description; used the same way as the name.</param>
    /// <param name="isInternational">
    /// When true the search query targets Asia including the Philippines, otherwise the Philippines only.
    /// </param>
    /// <param name="cancellationToken">Cancels the outgoing HTTP requests.</param>
    /// <returns>At most 10 suppliers, unique by name (case-insensitive); possibly empty.</returns>
    /// <exception cref="HttpRequestException">Tavily or Groq returned a non-success status code.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
    public async Task<List<SupplierResponse>> SearchAndFilterSuppliersAsync(
        string itemName, string itemDescription, bool isInternational,
        CancellationToken cancellationToken = default)
    {
        var locality = isInternational ? "all over the asia including Philippines" : "Philippines";

        // Step 1: Tavily — search text plus the base URLs of the result sites.
        var (searchContent, supplierUrls) =
            await SearchTavilyAsync(itemName, itemDescription, locality, cancellationToken);

        // Step 2: scrape contact details from those sites.
        var contactContent = await FetchContactPagesAsync(supplierUrls, cancellationToken);

        // Step 3: hand both to Groq; the marker below is referenced by the extraction prompt.
        var combined = searchContent + " CONTACT_PAGES_DATA: " + contactContent;
        return await FilterWithGroqAsync(itemName, itemDescription, combined, cancellationToken);
    }

    // ── Tavily ──

    /// <summary>
    /// Runs the Tavily search and returns a compact text digest of the results together with
    /// the distinct http/https base URLs (scheme, host and any non-default port) they point to.
    /// </summary>
    private async Task<(string content, List<string> urls)> SearchTavilyAsync(
        string itemName, string itemDescription, string locality, CancellationToken cancellationToken)
    {
        var query = $"{itemName} {itemDescription} suppliers {locality} budget price contact email phone";
        var payload = new
        {
            query,
            max_results = MaxSearchResults,
            search_depth = "advanced",
            include_answer = false
        };

        var body = await PostJsonAsync("Tavily:SearchUrl", "Tavily:ApiKey", payload, cancellationToken);
        var result = JsonSerializer.Deserialize<TavilySearchResult>(body, JsonOptions);

        var digest = new StringBuilder();
        var urls = new List<string>();
        var knownBaseUrls = new HashSet<string>();
        int index = 1;
        foreach (var r in result?.Results ?? new())
        {
            // Reduce the snippet to printable ASCII, collapse long whitespace runs, cap its length.
            var clean = NonPrintableAscii.Replace(r.Content ?? "", " ");
            clean = WhitespaceRun.Replace(clean, " ");
            clean = Truncate(clean, MaxSnippetLength);
            digest.Append($"{index}. Title:{r.Title}|URL:{r.Url}|Content:{clean}|");

            // Only absolute http(s) URLs can be fetched later; null or malformed ones are skipped.
            if (Uri.TryCreate(r.Url, UriKind.Absolute, out var uri)
                && (uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps))
            {
                var baseUrl = uri.IsDefaultPort
                    ? $"{uri.Scheme}://{uri.Host}"
                    : $"{uri.Scheme}://{uri.Host}:{uri.Port}";
                if (knownBaseUrls.Add(baseUrl))
                {
                    urls.Add(baseUrl);
                }
            }
            index++;
        }

        return (Truncate(digest.ToString(), MaxSectionLength), urls);
    }

    // ── Fetch Contact Pages ───

    /// <summary>
    /// Looks up contact details for the first few base URLs and returns them as one string of
    /// <c>[baseUrl]: details | </c> sections, in the order of <paramref name="baseUrls"/>.
    /// Sites are queried concurrently so the total wait is bounded by the slowest site rather
    /// than the sum of all per-request timeouts.
    /// </summary>
    private async Task<string> FetchContactPagesAsync(List<string> baseUrls, CancellationToken cancellationToken)
    {
        var lookups = baseUrls
            .Take(MaxContactDomains)
            .Select(baseUrl => FetchDomainContactsAsync(baseUrl, cancellationToken))
            .ToList();

        // Task.WhenAll returns results in the order of the input tasks.
        var sections = await Task.WhenAll(lookups);
        return Truncate(string.Concat(sections), MaxSectionLength);
    }

    /// <summary>
    /// Tries each known contact path on one site and returns a section for the first page that
    /// yields an e-mail address or phone number, or an empty string when none does.
    /// Unreachable, slow or unparsable pages are skipped.
    /// </summary>
    private async Task<string> FetchDomainContactsAsync(string baseUrl, CancellationToken cancellationToken)
    {
        foreach (var path in ContactPaths)
        {
            try
            {
                // Per-request timeout that also observes the caller's token.
                using var timeout = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
                timeout.CancelAfter(ContactPageTimeout);

                using var response = await _httpClient.GetAsync(baseUrl + path, timeout.Token);
                if (!response.IsSuccessStatusCode)
                {
                    continue;
                }

                var html = await response.Content.ReadAsStringAsync();
                var contactText = ExtractContactSignals(html);
                if (contactText.Length > 0)
                {
                    // Found contact data for this site; no need to try the remaining paths.
                    return $"[{baseUrl}]: {contactText} | ";
                }
            }
            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
            {
                // The caller gave up; that is not a per-page failure.
                throw;
            }
            catch (Exception)
            {
                // Best effort: timeout, network error or undecodable body — try the next path.
            }
        }

        return string.Empty;
    }

    /// <summary>
    /// Extracts the distinct e-mail addresses and phone-like numbers from an HTML page,
    /// e-mails first, joined with ", ". Returns an empty string when the page has neither.
    /// </summary>
    private static string ExtractContactSignals(string html)
    {
        var text = ScriptOrStyleBlock.Replace(html, " ");
        // mailto:/tel: targets live inside tag attributes; surface them as text before tags are removed.
        text = ContactLink.Replace(text, m => " " + Uri.UnescapeDataString(m.Groups[1].Value) + " ");
        text = HtmlTag.Replace(text, " ");
        // Decode entities only after tag removal so encoded brackets cannot form new tags.
        text = System.Net.WebUtility.HtmlDecode(text);
        text = NonPrintableAscii.Replace(text, " ");

        var emails = EmailAddress.Matches(text).Cast<Match>().Select(m => m.Value);
        var phones = PhoneNumber.Matches(text).Cast<Match>().Select(m => m.Value);

        var signals = emails
            .Concat(phones)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .Take(MaxContactSignals);
        return string.Join(", ", signals);
    }

    // ── Groq ─────────────────────────────────────────────────────────────────

    /// <summary>
    /// Asks the Groq model to extract suppliers from <paramref name="searchContent"/> and maps
    /// the reply to <see cref="SupplierResponse"/> objects with fixed default commercial terms.
    /// Entries without a company name or contact e-mail are dropped, as are repeated names.
    /// Returns an empty list when the reply contains no parsable JSON array.
    /// </summary>
    private async Task<List<SupplierResponse>> FilterWithGroqAsync(
        string itemName, string itemDescription, string searchContent, CancellationToken cancellationToken)
    {
        var prompt = $"Extract top 10 unique suppliers for: {itemName} {itemDescription}. " +
            "Prioritize Philippines suppliers first. " +
            "IMPORTANT: Look carefully in CONTACT_PAGES_DATA section for real phone numbers and emails. " +
            "Extract exact email addresses and phone numbers found. " +
            "For domains without contact data found, infer email as sales@domain or info@domain. " +
            "Prefer budget-friendly suppliers. No duplicates. " +
            "Return ONLY a raw JSON array: company_name, country, phone_number, contact_email, website, estimated_price_usd, item_specifications. " +
            $"Null for missing. JSON array only. Data: {searchContent}";
        var payload = new
        {
            model = _config["Groq:Model"] ?? "llama-3.1-8b-instant",
            stream = false,
            max_tokens = 2048,
            temperature = 0.1,
            messages = new[]
            {
                new { role = "system", content = "You are a supplier data extractor. Extract real contact details from provided content. Return ONLY a valid JSON array, no markdown, no explanation." },
                new { role = "user", content = prompt }
            }
        };

        var body = await PostJsonAsync("Groq:ApiUrl", "Groq:ApiKey", payload, cancellationToken);
        var groqResp = JsonSerializer.Deserialize<GroqResponse>(body, JsonOptions);

        // FirstOrDefault instead of [0]: an empty choices collection must not throw.
        var rawText = groqResp?.Choices?.FirstOrDefault()?.Message?.Content ?? string.Empty;
        var match = JsonArray.Match(rawText);
        if (!match.Success)
        {
            return new List<SupplierResponse>();
        }

        List<GroqSupplierResult> groqList;
        try
        {
            groqList = JsonSerializer.Deserialize<List<GroqSupplierResult>>(match.Value, JsonOptions)
                ?? new List<GroqSupplierResult>();
        }
        catch (JsonException)
        {
            // The model's reply is free text; a truncated or malformed array is treated like "no array".
            return new List<SupplierResponse>();
        }

        var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        var suppliers = new List<SupplierResponse>();
        foreach (var s in groqList)
        {
            // A JSON array may legally contain null elements.
            if (s is null)
            {
                continue;
            }

            var key = (s.CompanyName ?? "").Trim();
            if (key.Length == 0 || string.IsNullOrWhiteSpace(s.ContactEmail))
            {
                continue;
            }

            // Record the name only once the entry is usable, so an e-mail-less duplicate
            // cannot shadow a later entry for the same company that does have an e-mail.
            if (!seen.Add(key))
            {
                continue;
            }

            suppliers.Add(new SupplierResponse
            {
                SupplierName = s.CompanyName,
                EmailAddress = s.ContactEmail,
                ContactNo = s.PhoneNumber ?? string.Empty,
                Address = s.Country ?? string.Empty,
                IsActive = true,
                VatInc = false,
                Currency = "PHP",
                CurrencyId = 1,
                PaymentTermsId = 1,
                PaymentTerms = "30 Days",
                LeadTime = "7-14 Days",
                TinNo = string.Empty,
                ContactPerson = string.Empty,
                Website = s.Website ?? string.Empty,
            });
            if (suppliers.Count >= MaxSuppliers)
            {
                break;
            }
        }
        return suppliers;
    }

    // ── Helpers ──

    /// <summary>
    /// POSTs <paramref name="payload"/> as JSON with a bearer token and returns the response body.
    /// The URL and API key are read from configuration under the given setting names.
    /// </summary>
    /// <exception cref="HttpRequestException">The response status code was not a success code.</exception>
    private async Task<string> PostJsonAsync<TPayload>(
        string urlSetting, string apiKeySetting, TPayload payload, CancellationToken cancellationToken)
    {
        using var request = new HttpRequestMessage(HttpMethod.Post, _config[urlSetting]);
        request.Headers.Add("Authorization", $"Bearer {_config[apiKeySetting]}");
        request.Content = new StringContent(
            JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json");

        using var response = await _httpClient.SendAsync(request, cancellationToken);
        response.EnsureSuccessStatusCode();
        return await response.Content.ReadAsStringAsync();
    }

    /// <summary>Returns <paramref name="value"/> cut to at most <paramref name="maxLength"/> characters.</summary>
    private static string Truncate(string value, int maxLength) =>
        value.Length > maxLength ? value[..maxLength] : value;
}
}