deleted: "FFAIall\357\274\210\346\232\202\345\201\234\357\274\211/__pycache__/analyzer.cpython-313.pyc"
deleted: "FFAIall\357\274\210\346\232\202\345\201\234\357\274\211/__pycache__/crawlers.cpython-313.pyc" deleted: "FFAIall\357\274\210\346\232\202\345\201\234\357\274\211/analyzer.py" deleted: "FFAIall\357\274\210\346\232\202\345\201\234\357\274\211/crawlers.py" deleted: "FFAIall\357\274\210\346\232\202\345\201\234\357\274\211/main.py"
This commit is contained in:
parent a7abe9d506
commit 1b490d774f
Binary file not shown.
Binary file not shown.
@@ -1,39 +0,0 @@
import re
import os  # used by search_in_cache below
from collections import Counter


class PureAnalyzer:

    @staticmethod
    def search_in_cache(query: str, cache_dir="cache") -> list:
        """Search the cache for related history records."""
        if not os.path.exists(cache_dir):
            return []

        related_files = []
        safe_query = query.lower()
        for filename in os.listdir(cache_dir):
            if safe_query in filename.lower():
                with open(f"{cache_dir}/{filename}", "r", encoding="utf-8") as f:
                    content = f.read()
                related_files.append({
                    "query": filename.replace(".txt", ""),
                    "content": content
                })
        return related_files

    @staticmethod
    def analyze(data: list, query: str) -> dict:
        # First check the cache for related records
        history = PureAnalyzer.search_in_cache(query)

        # Merge new and cached data
        all_text = " ".join(d.get("text", "") for d in data)
        if history:
            all_text += " " + " ".join(h["content"] for h in history)

        # ... (original analysis logic unchanged; it defines summary and keywords) ...
        return {
            "summary": summary,
            "keywords": keywords,
            "sources": [d["url"] for d in data],
            "related_history": [h["query"] for h in history]
        }
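The analysis step itself is elided in the file above, which is why summary and keywords appear only in the return statement. A minimal sketch of what that step could look like, using only the modules the file already imports (re, Counter); the keyword count and summary length are illustrative assumptions, not taken from the original code:

import re
from collections import Counter

def naive_analyze(all_text: str):
    # Hypothetical stand-in for the elided analysis logic.
    # Keywords: the ten most frequent word-like tokens in the merged text.
    words = re.findall(r"\w+", all_text.lower())
    keywords = [w for w, _ in Counter(words).most_common(10)]
    # Summary: naively, the first 200 characters of the merged text.
    summary = all_text[:200]
    return summary, keywords

Inside analyze(), the equivalent call would be summary, keywords = naive_analyze(all_text) just before the return.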
@@ -1,71 +0,0 @@
import urllib.request
import os
import time
from urllib.parse import quote
from html.parser import HTMLParser


class PureHTMLParser(HTMLParser):
    # ... (HTML parser code unchanged from before) ...


class PureCrawler:
    def __init__(self, cache_dir="cache"):
        self.user_agent = "Mozilla/5.0"
        self.parser = PureHTMLParser()
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

    def _get_cache_path(self, query: str) -> str:
        """Build the cache file name for a query."""
        safe_query = "".join(c if c.isalnum() else "_" for c in query)
        return f"{self.cache_dir}/{safe_query}.txt"

    def _save_to_cache(self, query: str, data: list):
        """Save search results to the cache."""
        with open(self._get_cache_path(query), "w", encoding="utf-8") as f:
            for item in data:
                f.write(f"URL: {item['url']}\n")
                f.write(f"Text: {item['text']}\n")
                f.write("="*50 + "\n")

    def _load_from_cache(self, query: str) -> list:
        """Load data from the cache."""
        cache_file = self._get_cache_path(query)
        if not os.path.exists(cache_file):
            return None

        with open(cache_file, "r", encoding="utf-8") as f:
            content = f.read()

        # Parse the cache file back into a list of items
        items = []
        for block in content.split("="*50):
            if not block.strip():
                continue
            url = text = ""
            for line in block.split("\n"):
                if line.startswith("URL: "):
                    url = line[5:]
                elif line.startswith("Text: "):
                    text = line[6:]
            if url:
                items.append({"url": url, "text": text})
        return items

    def fetch(self, query: str, force_update=False) -> list:
        """Read from the cache first; crawl only when nothing is cached."""
        if not force_update:
            cached = self._load_from_cache(query)
            if cached:
                print("📂 Loading data from cache")
                return cached

        print("🌐 Crawling the web...")
        # ... (original crawling logic unchanged; it populates self.parser.links) ...
        data = [{"url": link, "text": self.extract_text(link)} for link in self.parser.links[:5]]

        self._save_to_cache(query, data)
        return data

    def extract_text(self, url: str) -> str:
        # ... (original text-extraction logic unchanged; it sets extracted_text) ...
        return extracted_text
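Both the PureHTMLParser body and the text-extraction logic are elided in the file above. A minimal sketch of what they might have looked like, using only modules the file already imports (urllib.request, HTMLParser); the link filter, the timeout, and the standalone extract_text helper are illustrative assumptions rather than the original implementation:

import urllib.request
from html.parser import HTMLParser

class PureHTMLParser(HTMLParser):
    # Hypothetical link and text collector.
    def __init__(self):
        super().__init__()
        self.links = []
        self.text_parts = []

    def handle_starttag(self, tag, attrs):
        # Remember absolute links found in <a href="..."> tags.
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value and value.startswith("http"):
                    self.links.append(value)

    def handle_data(self, data):
        # Accumulate visible text so it can be returned as page content.
        if data.strip():
            self.text_parts.append(data.strip())

def extract_text(url: str, user_agent: str = "Mozilla/5.0") -> str:
    # Hypothetical stand-in for PureCrawler.extract_text(): download the page
    # and return its visible text via the parser above.
    req = urllib.request.Request(url, headers={"User-Agent": user_agent})
    with urllib.request.urlopen(req, timeout=10) as resp:
        html = resp.read().decode("utf-8", errors="ignore")
    parser = PureHTMLParser()
    parser.feed(html)
    return " ".join(parser.text_parts)

fetch() additionally needs a search request that fills self.parser.links before the list comprehension runs; that URL is not shown anywhere in the diff, so it is left out here.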
@@ -1,59 +0,0 @@
from crawlers import PureCrawler  # type: ignore
from analyzer import PureAnalyzer  # type: ignore
import sys
import os


class PureInfoHunter:
    def __init__(self):
        self.crawler = PureCrawler()  # the fetch()/cache API lives on PureCrawler, not PureHTMLParser
        self.analyzer = PureAnalyzer()

    def run(self, query: str):
        # 1. Fetch data (cache first)
        data = self.crawler.fetch(query)

        # 2. Analyze (automatically searches the history cache)
        result = self.analyzer.analyze(data, query)

        # 3. Build the report
        report = "="*40 + "\n"
        report += f"Search query: {query}\n"

        if result.get("related_history"):
            report += f"Related history: {', '.join(result['related_history'])}\n"

        report += "\nAnalysis:\n" + result["summary"] + "\n"
        report += "Sources:\n"
        for url in result["sources"]:
            report += f"- {url}\n"

        # Save this report
        self._save_report(query, report)
        return report

    def _save_report(self, query: str, content: str):
        """Save the analysis report."""
        safe_query = "".join(c if c.isalnum() else "_" for c in query)
        with open(f"reports/{safe_query}_report.txt", "w", encoding="utf-8") as f:
            f.write(content)
        print(f"Report saved to reports/{safe_query}_report.txt")


if __name__ == "__main__":
    os.makedirs("reports", exist_ok=True)

    if len(sys.argv) < 2:
        print("Usage: python pure_main.py '<search keywords>' [force_update]")
        print("Example: python pure_main.py 'artificial intelligence' true")
        sys.exit(1)

    force_update = len(sys.argv) > 2 and sys.argv[2].lower() == "true"
    hunter = PureInfoHunter()

    if force_update:
        print("Force-update mode (cache ignored)")
        data = hunter.crawler.fetch(sys.argv[1], force_update=True)
        result = hunter.analyzer.analyze(data, sys.argv[1])
    else:
        result = hunter.run(sys.argv[1])

    print(result)
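Taken together, the three deleted modules formed a cache-first pipeline: crawl or load cached results, analyze them together with related history, then write a report. A short, hypothetical usage sketch (it assumes the files are importable as crawlers, analyzer, and main, as the imports above suggest; the query string is only an example):

from main import PureInfoHunter

hunter = PureInfoHunter()
# Cache-first run: fetch (or load from cache/), analyze, save a report under reports/.
report = hunter.run("artificial intelligence")
print(report)

# Equivalent of the force_update branch: ignore the cache and re-crawl.
data = hunter.crawler.fetch("artificial intelligence", force_update=True)
result = hunter.analyzer.analyze(data, "artificial intelligence")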