diff --git a/FFAIall/main.py b/FFAIall/main.py
index 414b8ce..7e5622f 100644
--- a/FFAIall/main.py
+++ b/FFAIall/main.py
@@ -1,5 +1,7 @@
 from crawlers import PureHTMLParser  # type: ignore
 from analyzer import PureAnalyzer  # type: ignore
+import sys
+import os
 
 class PureInfoHunter:
     def __init__(self):
@@ -37,8 +39,6 @@ class PureInfoHunter:
         print(f"Report saved to reports/{safe_query}_report.txt")
 
 if __name__ == "__main__":
-    import sys
-    import os
     os.makedirs("reports", exist_ok=True)
 
     if len(sys.argv) < 2:
diff --git a/FFAInobug/__pycache__/crawlers.cpython-313.pyc b/FFAInobug/__pycache__/crawlers.cpython-313.pyc
index 8bfb92a..7502fc1 100644
Binary files a/FFAInobug/__pycache__/crawlers.cpython-313.pyc and b/FFAInobug/__pycache__/crawlers.cpython-313.pyc differ
diff --git a/FFAInobug/crawlers.py b/FFAInobug/crawlers.py
index e0130b2..f3c09c2 100644
--- a/FFAInobug/crawlers.py
+++ b/FFAInobug/crawlers.py
@@ -3,8 +3,12 @@ import os
 import time
 from urllib.parse import quote
 from html.parser import HTMLParser
+import requests  # type: ignore
+from bs4 import BeautifulSoup  # type: ignore
+from urllib.parse import quote_plus
 
 class PureHTMLParser(HTMLParser):
+    # ... (previous HTML parser code unchanged) ...
 
 class PureCrawler:
 
@@ -14,6 +18,14 @@ class PureHTMLParser(HTMLParser):
         self.cache_dir = cache_dir
         os.makedirs(cache_dir, exist_ok=True)
 
+    def _is_cache_valid(self, cache_file):
+        """Check whether the cache file exists and has not yet expired."""
+        if not os.path.exists(cache_file):
+            return False
+
+        file_time = os.path.getmtime(cache_file)
+        return (time.time() - file_time) < self.cache_expiry
+
     def _get_cache_path(self, query: str) -> str:
         """Generate the cache file name."""
         safe_query = "".join(c if c.isalnum() else "_" for c in query)
@@ -51,21 +63,49 @@ class PureHTMLParser(HTMLParser):
             items.append({"url": url, "text": text})
         return items
 
-    def fetch(self, query: str, force_update=False) -> list:
-        """Read the cache first; crawl only when it is missing."""
-        if not force_update:
-            cached = self._load_from_cache(query)
-            if cached:
-                print("📂 Loading data from cache")
-                return cached
+    def fetch(self, query: str, force_update=False) -> dict:
+        """Read from the cache while it is valid; otherwise crawl and refresh it."""
+        cache_file = os.path.join(self.cache_dir, f"{quote_plus(query)}.json")
 
-        print("🌐 Crawling web data...")
-        # ... (original crawling logic unchanged) ...
-        data = [{"url": link, "text": self.extract_text(link)} for link in self.parser.links[:5]]
-
-        self._save_to_cache(query, data)
-        return data
-
-    def extract_text(self, url: str) -> str:
-        # ... (original text-extraction logic unchanged) ...
-        return extracted_text
\ No newline at end of file
+        # Serve from the cache while it is still valid
+        if not force_update and self._is_cache_valid(cache_file):
+            return self._load_from_cache(cache_file)
+
+        try:
+            # Actual fetch logic - Baidu search as an example
+            search_url = f"https://www.baidu.com/s?wd={quote_plus(query)}"
+            response = requests.get(search_url, headers=self.headers, timeout=10)
+            response.raise_for_status()
+
+            # Parse the page content
+            soup = BeautifulSoup(response.text, 'html.parser')
+            results = []
+
+            # Extract search results - adjust the selectors to the actual site structure
+            for item in soup.select('.result.c-container'):
+                title = item.select_one('h3').get_text(strip=True)
+                link = item.find('a')['href']
+                abstract = item.select_one('.c-abstract').get_text(strip=True) if item.select_one('.c-abstract') else ""
+                results.append({
+                    'title': title,
+                    'url': link,
+                    'abstract': abstract
+                })
+
+            data = {
+                'query': query,
+                'results': results,
+                'timestamp': int(time.time()),
+                'sources': [search_url]
+            }
+
+            # Save to cache
+            self._save_to_cache(cache_file, data)
+            return data
+
+        except Exception as e:
+            # If the fetch fails but a cache exists, fall back to it
+            if os.path.exists(cache_file):
+                print(f"Fetch failed, using cached data: {str(e)}")
+                return self._load_from_cache(cache_file)
+            raise RuntimeError(f"Fetch failed and no cache is available: {str(e)}")
\ No newline at end of file
diff --git a/FFAInobug/main.py b/FFAInobug/main.py
index 5cc8931..594b618 100644
--- a/FFAInobug/main.py
+++ b/FFAInobug/main.py
@@ -41,19 +41,23 @@ if __name__ == "__main__":
     import os
     os.makedirs("reports", exist_ok=True)
 
+    # Handle missing command-line arguments
     if len(sys.argv) < 2:
         print("Usage: python pure_main.py '<search keywords>' [force_update]")
         print("Example: python pure_main.py 'artificial intelligence' true")
-        # sys.exit(1)
+        query = input("Enter the keywords to search for: ")  # fall back to interactive input
+        force_update = input("Force update (true/false)? ").lower() == "true"
+    else:
+        query = sys.argv[1]
+        force_update = len(sys.argv) > 2 and sys.argv[2].lower() == "true"
 
-    force_update = len(sys.argv) > 2 and sys.argv[2].lower() == "true"
     hunter = PureInfoHunter()
 
-    # if force_update:
-    #     print("Force-update mode (cache ignored)")
-    #     data = hunter.crawler.fetch(sys.argv[1], force_update=True)
-    #     result = hunter.analyzer.analyze(data, sys.argv[1])
-    # else:
-    #     result = hunter.run(sys.argv[1])
+    if force_update:
+        print("Force-update mode (cache ignored)")
+        data = hunter.crawler.fetch(query, force_update=True)
+        result = hunter.analyzer.analyze(data, query)
+    else:
+        result = hunter.run(query)
 
-    # print(result)
\ No newline at end of file
+    print(result)
\ No newline at end of file
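
For reference, below is a minimal, self-contained sketch of the cache-or-fetch pattern the new fetch implements: mtime-based expiry, and a fall back to stale cached data when the network request fails. The standalone function names (cache_is_valid, fetch_with_cache) and the one-hour TTL are illustrative assumptions, not part of the patch.

import json
import os
import time

import requests  # same third-party dependency the patch introduces

CACHE_EXPIRY = 3600  # assumed 1-hour TTL; the patch reads self.cache_expiry instead


def cache_is_valid(path: str, expiry: int = CACHE_EXPIRY) -> bool:
    """Mirror of _is_cache_valid: the file exists and is younger than the expiry window."""
    return os.path.exists(path) and (time.time() - os.path.getmtime(path)) < expiry


def fetch_with_cache(url: str, cache_file: str) -> dict:
    """Return fresh cached data; otherwise fetch, write the cache, and return the result."""
    # Serve a fresh cache hit without touching the network.
    if cache_is_valid(cache_file):
        with open(cache_file, encoding="utf-8") as f:
            return json.load(f)
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        data = {"url": url, "body": response.text, "timestamp": int(time.time())}
        with open(cache_file, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False)
        return data
    except Exception as exc:
        # Stale data beats no data, matching the patch's fallback branch.
        if os.path.exists(cache_file):
            with open(cache_file, encoding="utf-8") as f:
                return json.load(f)
        raise RuntimeError(f"fetch failed and no cache available: {exc}") from exc

Keeping the expiry check separate from the fetch path means an expired cache never blocks a retry, while the except branch still prefers stale data over a hard failure.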