From b1652bc1c0732239928c848b700125588a7d65dd Mon Sep 17 00:00:00 2001
From: Friendfeng <3880261409@qq.com>
Date: Sat, 7 Jun 2025 02:41:28 +0800
Subject: [PATCH] modified: FFAInobug/crawlers.py
 modified: FFAInobug/main.py

---
 FFAInobug/crawlers.py | 46 +++++++++++++++++++++++++------------------
 FFAInobug/main.py     |  2 +-
 2 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/FFAInobug/crawlers.py b/FFAInobug/crawlers.py
index f3c09c2..ed37cb3 100644
--- a/FFAInobug/crawlers.py
+++ b/FFAInobug/crawlers.py
@@ -10,11 +10,9 @@ from urllib.parse import quote_plus
 class PureHTMLParser(HTMLParser):
     # ...(previous HTML parser code unchanged)...
-
-
 class PureCrawler:
     def __init__(self, cache_dir="cache"):
         self.user_agent = "Mozilla/5.0"
-        self.parser = PureHTMLParser()
+        # self.parser = PureHTMLParser()
         self.cache_dir = cache_dir
         os.makedirs(cache_dir, exist_ok=True)
 
@@ -35,8 +33,8 @@ class PureHTMLParser(HTMLParser):
         """Save search results to the cache."""
         with open(self._get_cache_path(query), "w", encoding="utf-8") as f:
             for item in data:
-                f.write(f"URL: {item['url']}\n")
-                f.write(f"Text: {item['text']}\n")
+                f.write(f"URL: {item.get('url', '')}\n")
+                f.write(f"Text: {item.get('abstract', item.get('text', ''))}\n")
                 f.write("="*50 + "\n")
 
     def _load_from_cache(self, query: str) -> list:
@@ -64,11 +62,16 @@ class PureHTMLParser(HTMLParser):
         return items
 
     def fetch(self, query, force_update=False):
-
+        # Make sure default headers exist
+        if not hasattr(self, 'headers'):
+            self.headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            }
+
         cache_file = os.path.join(self.cache_dir, f"{quote_plus(query)}.json")
 
         # Check whether the cache is still valid
-        if not force_update and self._is_cache_valid(cache_file):
+        if not force_update and os.path.exists(cache_file) and self._is_cache_valid(cache_file):
             return self._load_from_cache(cache_file)
 
         try:
@@ -79,29 +82,34 @@ class PureHTMLParser(HTMLParser):
 
             # Parse the page content
             soup = BeautifulSoup(response.text, 'html.parser')
-            results = [(query)]
+            results = []
 
-            # Extract search results - adjust to the actual site structure
+            # Extract search results - the selectors for Baidu results may need adjusting
             for item in soup.select('.result.c-container'):
-                title = item.select_one('h3').get_text(strip=True)
-                link = item.find('a')['href']
-                abstract = item.select_one('.c-abstract').get_text(strip=True) if item.select_one('.c-abstract') else ""
-                results.append({
-                    'title': title,
-                    'url': link,
-                    'abstract': abstract
-                })
+                title_elem = item.select_one('h3')
+                link_elem = item.find('a')
+                abstract_elem = item.select_one('.c-abstract')
+
+                if title_elem and link_elem:
+                    results.append({
+                        'title': title_elem.get_text(strip=True),
+                        'url': link_elem.get('href'),
+                        'abstract': abstract_elem.get_text(strip=True) if abstract_elem else ""
+                    })
 
             data = {
                 'query': query,
-                'results': results,
+                'results': results if results else [{'title': 'no results', 'url': '', 'abstract': ''}],
                 'timestamp': int(time.time()),
                 'sources': [search_url]
             }
 
             # Save to cache
             self._save_to_cache(cache_file, data)
-            return data
+            return {
+                'data': data,
+                'sources': ["www.baidu.com"]
+            }
 
         except Exception as e:
             # If fetching fails but a cache exists, fall back to the cache
diff --git a/FFAInobug/main.py b/FFAInobug/main.py
index 594b618..11a846d 100644
--- a/FFAInobug/main.py
+++ b/FFAInobug/main.py
@@ -55,7 +55,7 @@ if __name__ == "__main__":
 
     if force_update:
         print("Force-update mode (ignoring cache)")
-        data = hunter.crawler.fetch(query, force_update=True)
+        data = hunter.crawler.fetch(query, force_update=True)  # fetch() accepts force_update (see crawlers.py)
         result = hunter.analyzer.analyze(data, query)
     else:
         result = hunter.run(query)
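
Notes:

This patch changes the return shape of PureCrawler.fetch(): a fresh crawl now
returns {'data': ..., 'sources': [...]}, while a cache hit still returns the
raw list from _load_from_cache(), so callers currently have to handle both
shapes. A minimal caller sketch under that assumption -- the query string and
the shape check are illustrative, not part of the patch:

    from FFAInobug.crawlers import PureCrawler

    crawler = PureCrawler(cache_dir="cache")
    result = crawler.fetch("python html parser", force_update=True)

    if isinstance(result, dict) and 'data' in result:
        hits = result['data']['results']   # fresh crawl: wrapped payload
    else:
        hits = result                      # cache hit: raw item list
    for hit in hits:
        print(hit.get('title', ''), '->', hit.get('url', ''))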
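_save_to_cache() writes a line-oriented record format (a "URL: " line, a
"Text: " line, then a rule of '=' characters per result). _load_from_cache()
is not shown in this hunk; a reader for that on-disk format could look like
the sketch below -- an assumption about the layout, not the project's actual
implementation:

    def parse_cache(path):
        """Parse the URL:/Text: records written by _save_to_cache."""
        items, current = [], {}
        with open(path, encoding="utf-8") as f:
            for raw in f:
                line = raw.rstrip("\n")
                if line.startswith("URL: "):
                    current['url'] = line[5:]
                elif line.startswith("Text: "):
                    current['text'] = line[6:]
                elif line.startswith("=") and current:  # record separator
                    items.append(current)
                    current = {}
        return items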
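fetch() now initializes self.headers lazily behind a hasattr() check. Setting
the default in __init__ would make the attribute unconditional and drop the
check; a sketch of that alternative (the constructor is only partially shown
in this patch):

    def __init__(self, cache_dir="cache"):
        self.user_agent = "Mozilla/5.0"
        self.headers = {'User-Agent': self.user_agent}  # single source of truth
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)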