diff --git a/FFAIall(暂停)/__pycache__/analyzer.cpython-313.pyc b/FFAIall(暂停)/__pycache__/analyzer.cpython-313.pyc
deleted file mode 100644
index ca0b8e4..0000000
Binary files a/FFAIall(暂停)/__pycache__/analyzer.cpython-313.pyc and /dev/null differ
diff --git a/FFAIall(暂停)/__pycache__/crawlers.cpython-313.pyc b/FFAIall(暂停)/__pycache__/crawlers.cpython-313.pyc
deleted file mode 100644
index 8bfb92a..0000000
Binary files a/FFAIall(暂停)/__pycache__/crawlers.cpython-313.pyc and /dev/null differ
diff --git a/FFAIall(暂停)/analyzer.py b/FFAIall(暂停)/analyzer.py
deleted file mode 100644
index 19b68d5..0000000
--- a/FFAIall(暂停)/analyzer.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import re
-from collections import Counter
-
-class PureAnalyzer:
-    @staticmethod
-    def search_in_cache(query: str, cache_dir="cache") -> list:
-        """Search the cache for earlier query records"""
-        if not os.path.exists(cache_dir):
-            return []
-
-        related_files = []
-        safe_query = query.lower()
-        for filename in os.listdir(cache_dir):
-            if safe_query in filename.lower():
-                with open(f"{cache_dir}/{filename}", "r", encoding="utf-8") as f:
-                    content = f.read()
-                related_files.append({
-                    "query": filename.replace(".txt", ""),
-                    "content": content
-                })
-        return related_files
-
-    @staticmethod
-    def analyze(data: list, query: str) -> dict:
-        # First look for related records in the cache
-        history = PureAnalyzer.search_in_cache(query)
-
-        # Merge the new data with the cached history
-        all_text = " ".join(d.get("text", "") for d in data)
-        if history:
-            all_text += " " + " ".join(h["content"] for h in history)
-
-        # ...(keep the original analysis logic)...
-        return {
-            "summary": summary,
-            "keywords": keywords,
-            "sources": [d["url"] for d in data],
-            "related_history": [h["query"] for h in history]
-        }
\ No newline at end of file
diff --git a/FFAIall(暂停)/crawlers.py b/FFAIall(暂停)/crawlers.py
deleted file mode 100644
index e0130b2..0000000
--- a/FFAIall(暂停)/crawlers.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import urllib.request
-import os
-import time
-from urllib.parse import quote
-from html.parser import HTMLParser
-
-class PureHTMLParser(HTMLParser):
-    # ...(keep the earlier HTML parser code unchanged)...
-
-class PureCrawler:
-    def __init__(self, cache_dir="cache"):
-        self.user_agent = "Mozilla/5.0"
-        self.parser = PureHTMLParser()
-        self.cache_dir = cache_dir
-        os.makedirs(cache_dir, exist_ok=True)
-
-    def _get_cache_path(self, query: str) -> str:
-        """Build the cache file name"""
-        safe_query = "".join(c if c.isalnum() else "_" for c in query)
-        return f"{self.cache_dir}/{safe_query}.txt"
-
-    def _save_to_cache(self, query: str, data: list):
-        """Save search results to the cache"""
-        with open(self._get_cache_path(query), "w", encoding="utf-8") as f:
-            for item in data:
-                f.write(f"URL: {item['url']}\n")
-                f.write(f"Text: {item['text']}\n")
-                f.write("="*50 + "\n")
-
-    def _load_from_cache(self, query: str) -> list:
-        """Load data from the cache"""
-        cache_file = self._get_cache_path(query)
-        if not os.path.exists(cache_file):
-            return None
-
-        with open(cache_file, "r", encoding="utf-8") as f:
-            content = f.read()
-
-        # Parse the cache file back into records
-        items = []
-        for block in content.split("="*50):
-            if not block.strip():
-                continue
-            url = text = ""
-            for line in block.split("\n"):
-                if line.startswith("URL: "):
-                    url = line[5:]
-                elif line.startswith("Text: "):
-                    text = line[6:]
-            if url:
-                items.append({"url": url, "text": text})
-        return items
-
-    def fetch(self, query: str, force_update=False) -> list:
-        """Read from the cache first; crawl only when nothing is cached"""
-        if not force_update:
-            cached = self._load_from_cache(query)
-            if cached:
-                print("📂 Loading data from cache")
-                return cached
-
-        print("🌐 Crawling the web...")
-        # ...(keep the original crawling logic)...
-        data = [{"url": link, "text": self.extract_text(link)} for link in self.parser.links[:5]]
-
-        self._save_to_cache(query, data)
-        return data
-
-    def extract_text(self, url: str) -> str:
-        # ...(keep the original text-extraction logic)...
-        return extracted_text
\ No newline at end of file
diff --git a/FFAIall(暂停)/main.py b/FFAIall(暂停)/main.py
deleted file mode 100644
index 7e5622f..0000000
--- a/FFAIall(暂停)/main.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from crawlers import PureHTMLParser  # type: ignore
-from analyzer import PureAnalyzer  # type: ignore
-import sys
-import os
-
-class PureInfoHunter:
-    def __init__(self):
-        self.crawler = PureHTMLParser()
-        self.analyzer = PureAnalyzer()
-
-    def run(self, query: str):
-        # 1. Fetch data (cache first)
-        data = self.crawler.fetch(query)
-
-        # 2. Analyze (automatically searches the cached history)
-        result = self.analyzer.analyze(data, query)
-
-        # 3. Build the report
-        report = "="*40 + "\n"
-        report += f"Search term: {query}\n"
-
-        if result.get("related_history"):
-            report += f"Related history: {', '.join(result['related_history'])}\n"
-
-        report += "\nAnalysis:\n" + result["summary"] + "\n"
-        report += "Sources:\n"
-        for url in result["sources"]:
-            report += f"- {url}\n"
-
-        # Save this report
-        self._save_report(query, report)
-        return report
-
-    def _save_report(self, query: str, content: str):
-        """Save the analysis report"""
-        safe_query = "".join(c if c.isalnum() else "_" for c in query)
-        with open(f"reports/{safe_query}_report.txt", "w", encoding="utf-8") as f:
-            f.write(content)
-        print(f"Report saved to reports/{safe_query}_report.txt")
-
-if __name__ == "__main__":
-    os.makedirs("reports", exist_ok=True)
-
-    if len(sys.argv) < 2:
-        print("Usage: python pure_main.py '<search keywords>' [force_update]")
-        print("Example: python pure_main.py 'artificial intelligence' true")
-        sys.exit(1)
-
-    force_update = len(sys.argv) > 2 and sys.argv[2].lower() == "true"
-    hunter = PureInfoHunter()
-
-    if force_update:
-        print("Force-update mode (cache ignored)")
-        data = hunter.crawler.fetch(sys.argv[1], force_update=True)
-        result = hunter.analyzer.analyze(data, sys.argv[1])
-    else:
-        result = hunter.run(sys.argv[1])
-
-    print(result)
\ No newline at end of file