236 lines
10 KiB
C#
236 lines
10 KiB
C#
using CPRNIMS.Infrastructure.Dto.Canvass.Response;
|
|
using CPRNIMS.Infrastructure.Dto.Canvass.Result;
|
|
using Microsoft.Extensions.Configuration;
|
|
using System;
|
|
using System.Collections.Generic;
|
|
using System.Linq;
|
|
using System.Text;
|
|
using System.Text.Json;
|
|
using System.Text.RegularExpressions;
|
|
using System.Threading.Tasks;
|
|
|
|
namespace CPRNIMS.Domain.Services.Canvass
|
|
{
|
|
/// <summary>
/// Finds candidate suppliers for an item in three steps: a Tavily web search,
/// a best-effort scrape of contact pages on the discovered domains, and a Groq
/// chat-completion call that turns the combined text into structured suppliers.
/// </summary>
/// <remarks>
/// NOTE(review): the contact-page URLs come from third-party search results and
/// are fetched server-side. Only http/https URLs are followed, but no check is
/// made that the host is public — confirm this is acceptable for the deployment.
/// </remarks>
public class SupplierSearchService
{
    private const int MaxSnippetLength = 300;
    private const int MaxSearchContentLength = 2000;
    private const int MaxContactContentLength = 2000;
    private const int MaxContactDomains = 5;
    private const int MaxContactSignalsPerDomain = 50;
    private const int MinPhoneDigits = 7;
    private const int MaxPhoneDigits = 15;
    private const int MaxSuppliers = 10;

    private static readonly TimeSpan ContactPageTimeout = TimeSpan.FromSeconds(5);

    // Upper bound for regexes that run over untrusted, arbitrarily large HTML.
    // Declared before the Regex fields because static initializers run in textual order.
    private static readonly TimeSpan RegexTimeout = TimeSpan.FromSeconds(2);

    // Common contact page suffixes to try, in order of preference.
    private static readonly string[] ContactPaths =
        { "/contact", "/contact-us", "/pages/contact-us", "/about/contact", "/about" };

    // Shared so the serializer metadata is built once instead of on every call.
    private static readonly JsonSerializerOptions JsonOptions = new() { PropertyNameCaseInsensitive = true };

    private static readonly Regex NonPrintableAscii = new(@"[^\x20-\x7E]", RegexOptions.Compiled);
    private static readonly Regex LongWhitespaceRun = new(@"\s{3,}", RegexOptions.Compiled);
    private static readonly Regex AnyWhitespaceRun = new(@"\s+", RegexOptions.Compiled);
    private static readonly Regex JsonArray = new(@"\[[\s\S]*\]", RegexOptions.Compiled);

    private static readonly Regex ScriptOrStyleBlock = new(
        @"<(script|style)\b[^>]*>[\s\S]*?</\1\s*>",
        RegexOptions.Compiled | RegexOptions.IgnoreCase,
        RegexTimeout);

    private static readonly Regex HtmlTag = new(@"<[^>]+>", RegexOptions.Compiled, RegexTimeout);

    private static readonly Regex EmailAddress = new(
        @"[A-Za-z0-9._%+\-]+@[A-Za-z0-9\-]+(?:\.[A-Za-z0-9\-]+)+",
        RegexOptions.Compiled,
        RegexTimeout);

    // Digits optionally grouped with spaces, dashes or parentheses, e.g. "+63 (2) 8123-4567".
    private static readonly Regex PhoneNumber = new(
        @"\+?\(?\d[\d\-\(\) ]{5,}\d",
        RegexOptions.Compiled,
        RegexTimeout);

    private readonly HttpClient _httpClient;
    private readonly IConfiguration _config;

    /// <summary>Creates the service.</summary>
    /// <param name="httpClient">Client used for the Tavily, Groq and contact-page requests.</param>
    /// <param name="config">
    /// Configuration providing <c>Tavily:SearchUrl</c>, <c>Tavily:ApiKey</c>, <c>Groq:ApiUrl</c>,
    /// <c>Groq:ApiKey</c> and the optional <c>Groq:Model</c>.
    /// </param>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    public SupplierSearchService(HttpClient httpClient, IConfiguration config)
    {
        _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));
        _config = config ?? throw new ArgumentNullException(nameof(config));
    }

    /// <summary>
    /// Searches the web for suppliers of the given item and returns up to 10 unique
    /// suppliers that have a contact email.
    /// </summary>
    /// <param name="itemName">Name of the item to source.</param>
    /// <param name="itemDescription">Free-text description appended to the search query and prompt.</param>
    /// <param name="isInternational">
    /// When <c>true</c> the search targets Asia including the Philippines; otherwise the Philippines only.
    /// </param>
    /// <param name="cancellationToken">Cancels the outstanding HTTP calls.</param>
    /// <returns>The extracted suppliers; empty when the model returns no usable JSON array.</returns>
    /// <exception cref="HttpRequestException">The Tavily or Groq endpoint returned a non-success status.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
    public async Task<List<SupplierResponse>> SearchAndFilterSuppliersAsync(
        string itemName, string itemDescription, bool isInternational,
        CancellationToken cancellationToken = default)
    {
        var locality = isInternational
            ? "all over the asia including Philippines"
            : "Philippines";

        // Step 1: Tavily — search text plus the base URLs of the result sites.
        var (searchContent, supplierUrls) =
            await SearchTavilyAsync(itemName, itemDescription, locality, cancellationToken);

        // Step 2: Fetch contact pages from the discovered sites.
        var contactContent = await FetchContactPagesAsync(supplierUrls, cancellationToken);

        // Step 3: Combine search + contact content and let Groq extract the suppliers.
        var combined = searchContent + " CONTACT_PAGES_DATA: " + contactContent;
        return await FilterWithGroqAsync(itemName, itemDescription, combined, cancellationToken);
    }

    // ── Tavily ──────────────────────────────────────────────────────────────

    /// <summary>
    /// Runs the Tavily search and returns the results flattened into one text blob
    /// (capped at 2000 characters) together with the distinct http/https base URLs
    /// of the result pages, in result order.
    /// </summary>
    private async Task<(string content, List<string> urls)> SearchTavilyAsync(
        string itemName, string itemDescription, string locality,
        CancellationToken cancellationToken = default)
    {
        var query = $"{itemName} {itemDescription} suppliers {locality} budget price contact email phone";

        var payload = new
        {
            query,
            max_results = 10,
            search_depth = "advanced",
            include_answer = false
        };

        using var request = new HttpRequestMessage(HttpMethod.Post, _config["Tavily:SearchUrl"]);
        request.Headers.Add("Authorization", $"Bearer {_config["Tavily:ApiKey"]}");
        request.Content = new StringContent(
            JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json");

        using var response = await _httpClient.SendAsync(request, cancellationToken);
        response.EnsureSuccessStatusCode();

        var body = await response.Content.ReadAsStringAsync();
        var result = JsonSerializer.Deserialize<TavilySearchResult>(body, JsonOptions);

        var sb = new StringBuilder();
        var urls = new List<string>();
        var knownUrls = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        var index = 1;

        foreach (var r in result?.Results ?? new())
        {
            // Reduce the snippet to printable ASCII and keep it short.
            var clean = NonPrintableAscii.Replace(r.Content ?? "", " ");
            clean = LongWhitespaceRun.Replace(clean, " ");
            clean = Truncate(clean, MaxSnippetLength);

            sb.Append($"{index}. Title:{r.Title}|URL:{r.Url}|Content:{clean}|");
            index++;

            // Collect scheme + host (+ non-default port) for contact page fetching.
            // TryCreate rejects null/malformed URLs without throwing; anything that
            // is not http/https cannot be fetched and is skipped.
            if (Uri.TryCreate(r.Url, UriKind.Absolute, out var uri)
                && (uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps))
            {
                var baseUrl = $"{uri.Scheme}://{uri.Authority}";
                if (knownUrls.Add(baseUrl))
                {
                    urls.Add(baseUrl);
                }
            }
        }

        return (Truncate(sb.ToString(), MaxSearchContentLength), urls);
    }

    // ── Fetch Contact Pages ─────────────────────────────────────────────────

    /// <summary>
    /// Looks for contact details on the first 5 base URLs and returns them as
    /// <c>[baseUrl]: details | </c> segments, capped at 2000 characters.
    /// Sites are queried concurrently; the output keeps the input order.
    /// Unreachable or slow sites are skipped.
    /// </summary>
    private async Task<string> FetchContactPagesAsync(
        List<string> baseUrls, CancellationToken cancellationToken = default)
    {
        var lookups = baseUrls
            .Take(MaxContactDomains)
            .Select(baseUrl => FetchDomainContactAsync(baseUrl, cancellationToken))
            .ToList();

        // Task.WhenAll returns results in the order of the supplied tasks.
        var segments = await Task.WhenAll(lookups);

        return Truncate(string.Concat(segments), MaxContactContentLength);
    }

    /// <summary>
    /// Tries each known contact path on one site and returns the segment for the
    /// first page that yields contact details, or an empty string when none does.
    /// </summary>
    private async Task<string> FetchDomainContactAsync(string baseUrl, CancellationToken cancellationToken)
    {
        foreach (var path in ContactPaths)
        {
            try
            {
                // Per-request timeout that still honours the caller's cancellation.
                using var timeout = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
                timeout.CancelAfter(ContactPageTimeout);

                using var response = await _httpClient.GetAsync(baseUrl + path, timeout.Token);
                if (!response.IsSuccessStatusCode)
                {
                    continue;
                }

                var html = await response.Content.ReadAsStringAsync();
                var signals = ExtractContactSignals(html);
                if (signals.Count > 0)
                {
                    // Found contact details for this site; no need to try further paths.
                    return $"[{baseUrl}]: {string.Join(" ", signals)} | ";
                }
            }
            catch (OperationCanceledException) when (!cancellationToken.IsCancellationRequested)
            {
                // Per-request timeout — try the next path.
            }
            catch (Exception ex) when (ex is HttpRequestException
                                       or System.IO.IOException
                                       or InvalidOperationException
                                       or UriFormatException
                                       or RegexMatchTimeoutException)
            {
                // Unreachable site or unreadable page — scraping is best-effort, so skip it.
            }
        }

        return string.Empty;
    }

    /// <summary>
    /// Extracts distinct email addresses followed by plausible phone numbers from an HTML page.
    /// Emails are matched against the markup (so <c>mailto:</c> links count); phone numbers
    /// are matched against the tag-stripped text.
    /// </summary>
    private static List<string> ExtractContactSignals(string html)
    {
        // Drop script/style bodies, then decode entities such as "&#64;" or "&nbsp;".
        var markup = System.Net.WebUtility.HtmlDecode(ScriptOrStyleBlock.Replace(html, " "));

        var emails = EmailAddress.Matches(markup).Select(m => m.Value);

        var visibleText = HtmlTag.Replace(markup, " ");
        visibleText = NonPrintableAscii.Replace(visibleText, " ");
        visibleText = AnyWhitespaceRun.Replace(visibleText, " ");

        var phones = PhoneNumber.Matches(visibleText)
            .Select(m => m.Value.Trim())
            .Where(IsPlausiblePhoneNumber);

        return emails.Concat(phones)
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .Take(MaxContactSignalsPerDomain)
            .ToList();
    }

    /// <summary>Accepts candidates with 7 to 15 digits; shorter or longer digit runs are rejected.</summary>
    private static bool IsPlausiblePhoneNumber(string candidate)
    {
        var digits = candidate.Count(char.IsDigit);
        return digits is >= MinPhoneDigits and <= MaxPhoneDigits;
    }

    // ── Groq ────────────────────────────────────────────────────────────────

    /// <summary>
    /// Sends the collected text to the Groq chat-completion endpoint and maps the JSON
    /// array in the reply to at most 10 suppliers, unique by company name
    /// (case-insensitive). Entries without a company name or contact email are skipped.
    /// Returns an empty list when the reply contains no parseable JSON array.
    /// </summary>
    private async Task<List<SupplierResponse>> FilterWithGroqAsync(
        string itemName, string itemDescription, string searchContent,
        CancellationToken cancellationToken = default)
    {
        var prompt = $"Extract top 10 unique suppliers for: {itemName} {itemDescription}. " +
                     "Prioritize Philippines suppliers first. " +
                     "IMPORTANT: Look carefully in CONTACT_PAGES_DATA section for real phone numbers and emails. " +
                     "Extract exact email addresses and phone numbers found. " +
                     "For domains without contact data found, infer email as sales@domain or info@domain. " +
                     "Prefer budget-friendly suppliers. No duplicates. " +
                     "Return ONLY a raw JSON array: company_name, country, phone_number, contact_email, website, estimated_price_usd, item_specifications. " +
                     $"Null for missing. JSON array only. Data: {searchContent}";

        var payload = new
        {
            model = _config["Groq:Model"] ?? "llama-3.1-8b-instant",
            stream = false,
            max_tokens = 2048,
            temperature = 0.1,
            messages = new[]
            {
                new { role = "system", content = "You are a supplier data extractor. Extract real contact details from provided content. Return ONLY a valid JSON array, no markdown, no explanation." },
                new { role = "user", content = prompt }
            }
        };

        using var request = new HttpRequestMessage(HttpMethod.Post, _config["Groq:ApiUrl"]);
        request.Headers.Add("Authorization", $"Bearer {_config["Groq:ApiKey"]}");
        request.Content = new StringContent(
            JsonSerializer.Serialize(payload), Encoding.UTF8, "application/json");

        using var response = await _httpClient.SendAsync(request, cancellationToken);
        response.EnsureSuccessStatusCode();

        var body = await response.Content.ReadAsStringAsync();
        var groqResp = JsonSerializer.Deserialize<GroqResponse>(body, JsonOptions);

        // FirstOrDefault instead of [0]: an empty choices collection must not throw.
        var rawText = groqResp?.Choices?.FirstOrDefault()?.Message?.Content ?? string.Empty;

        return ParseSuppliers(rawText);
    }

    /// <summary>
    /// Pulls the first-'[' to last-']' span out of the model reply and converts it to suppliers.
    /// </summary>
    private static List<SupplierResponse> ParseSuppliers(string rawText)
    {
        var suppliers = new List<SupplierResponse>();

        var match = JsonArray.Match(rawText);
        if (!match.Success)
        {
            return suppliers;
        }

        List<GroqSupplierResult> candidates;
        try
        {
            candidates = JsonSerializer.Deserialize<List<GroqSupplierResult>>(match.Value, JsonOptions)
                         ?? new List<GroqSupplierResult>();
        }
        catch (JsonException)
        {
            // Model output that is truncated or not valid JSON is treated like
            // "no array found" instead of failing the whole search.
            return suppliers;
        }

        var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (var s in candidates)
        {
            if (s is null)
            {
                continue;
            }

            var name = s.CompanyName?.Trim();
            var email = s.ContactEmail?.Trim();

            // Check the email before recording the name, so an entry without an email
            // does not hide a later duplicate of the same company that has one.
            if (string.IsNullOrEmpty(name) || string.IsNullOrEmpty(email) || !seen.Add(name))
            {
                continue;
            }

            suppliers.Add(new SupplierResponse
            {
                SupplierName = name,
                EmailAddress = email,
                ContactNo = s.PhoneNumber ?? string.Empty,
                Address = s.Country ?? string.Empty,
                IsActive = true,
                VatInc = false,
                Currency = "PHP",
                CurrencyId = 1,
                PaymentTermsId = 1,
                PaymentTerms = "30 Days",
                LeadTime = "7-14 Days",
                TinNo = string.Empty,
                ContactPerson = string.Empty,
                Website = s.Website ?? string.Empty,
            });

            if (suppliers.Count >= MaxSuppliers)
            {
                break;
            }
        }

        return suppliers;
    }

    /// <summary>
    /// Returns <paramref name="value"/> cut to at most <paramref name="maxLength"/> characters,
    /// backing off one character when the cut would split a surrogate pair.
    /// </summary>
    private static string Truncate(string value, int maxLength)
    {
        if (value.Length <= maxLength)
        {
            return value;
        }

        var length = char.IsHighSurrogate(value[maxLength - 1]) ? maxLength - 1 : maxLength;
        return value[..length];
    }
}
|
|
}
|