modified: FFAIall/main.py
modified: FFAInobug/__pycache__/crawlers.cpython-313.pyc
modified: FFAInobug/crawlers.py
modified: FFAInobug/main.py
commit b04a959a27
parent ac9d058e6a
FFAIall/main.py
@@ -1,5 +1,7 @@
 from crawlers import PureHTMLParser  # type: ignore
 from analyzer import PureAnalyzer  # type: ignore
+import sys
+import os
 
 class PureInfoHunter:
     def __init__(self):
@@ -37,8 +39,6 @@ class PureInfoHunter:
         print(f"Report saved to reports/{safe_query}_report.txt")
 
 if __name__ == "__main__":
-    import sys
-    import os
     os.makedirs("reports", exist_ok=True)
 
     if len(sys.argv) < 2:
FFAInobug/__pycache__/crawlers.cpython-313.pyc
Binary file not shown.
FFAInobug/crawlers.py
@@ -3,8 +3,12 @@ import os
 import time
 from urllib.parse import quote
 from html.parser import HTMLParser
+import requests  # type: ignore
+from bs4 import BeautifulSoup  # type: ignore
+from urllib.parse import quote_plus
+
 class PureHTMLParser(HTMLParser):
 
     # ... (the HTML parser code from before is unchanged) ...
 
 class PureCrawler:
@@ -14,6 +18,14 @@ class PureHTMLParser(HTMLParser):
         self.cache_dir = cache_dir
         os.makedirs(cache_dir, exist_ok=True)
 
+    def _is_cache_valid(self, cache_file):
+        """Check whether the cache is still valid"""
+        if not os.path.exists(cache_file):
+            return False
+
+        file_time = os.path.getmtime(cache_file)
+        return (time.time() - file_time) < self.cache_expiry
+
     def _get_cache_path(self, query: str) -> str:
         """Generate the cache file name"""
         safe_query = "".join(c if c.isalnum() else "_" for c in query)
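For orientation, a minimal standalone sketch of the mtime-based expiry check that _is_cache_valid implements (the 3600-second TTL and the file name here are demo assumptions, not values taken from this commit):

import os
import time
import tempfile

cache_expiry = 3600  # assumed TTL in seconds; PureCrawler keeps its own value
cache_file = os.path.join(tempfile.gettempdir(), "demo_cache.json")
open(cache_file, "w").close()  # simulate a freshly written cache file

age = time.time() - os.path.getmtime(cache_file)  # seconds since last write
print(age < cache_expiry)  # True while the file is younger than the TTL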
@@ -51,21 +63,49 @@
             items.append({"url": url, "text": text})
         return items
 
-    def fetch(self, query: str, force_update=False) -> list:
-        """Read the cache first; crawl only when it is missing"""
-        if not force_update:
-            cached = self._load_from_cache(query)
-            if cached:
-                print("📂 Loading data from cache")
-                return cached
-
-        print("🌐 Crawling web data...")
-        # ... (the original crawl logic is unchanged) ...
-        data = [{"url": link, "text": self.extract_text(link)} for link in self.parser.links[:5]]
-
-        self._save_to_cache(query, data)
-
-    def extract_text(self, url: str) -> str:
-        # ... (the original text-extraction logic is unchanged) ...
-        return extracted_text
+    def fetch(self, query, force_update=False):
+        cache_file = os.path.join(self.cache_dir, f"{quote_plus(query)}.json")
+
+        # Check whether the cache is still valid
+        if not force_update and self._is_cache_valid(cache_file):
+            return self._load_from_cache(cache_file)
+
+        try:
+            # Actual scraping logic, using Baidu search as the example
+            search_url = f"https://www.baidu.com/s?wd={quote_plus(query)}"
+            response = requests.get(search_url, headers=self.headers, timeout=10)
+            response.raise_for_status()
+
+            # Parse the page content
+            soup = BeautifulSoup(response.text, 'html.parser')
+            results = []
+
+            # Extract the search results; adjust selectors to the actual site structure
+            for item in soup.select('.result.c-container'):
+                title = item.select_one('h3').get_text(strip=True)
+                link = item.find('a')['href']
+                abstract = item.select_one('.c-abstract').get_text(strip=True) if item.select_one('.c-abstract') else ""
+                results.append({
+                    'title': title,
+                    'url': link,
+                    'abstract': abstract
+                })
+
+            data = {
+                'query': query,
+                'results': results,
+                'timestamp': int(time.time()),
+                'sources': [search_url]
+            }
+
+            # Save to the cache
+            self._save_to_cache(cache_file, data)
+            return data
+
+        except Exception as e:
+            # If scraping fails but a cache exists, fall back to it
+            if os.path.exists(cache_file):
+                print(f"Scraping failed, using cached data: {str(e)}")
+                return self._load_from_cache(cache_file)
+            raise RuntimeError(f"Scraping failed and no cache is available: {str(e)}")
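With the rewritten fetch, the first call for a query scrapes Baidu and writes a JSON cache keyed by quote_plus(query); repeat calls inside the expiry window are served from disk, and force_update=True bypasses the cache. A usage sketch, assuming PureCrawler can be constructed without arguments (its actual constructor signature is not shown in this diff):

from crawlers import PureCrawler  # type: ignore

crawler = PureCrawler()  # assumed no-argument construction
data = crawler.fetch("artificial intelligence")        # scrapes and caches
cached = crawler.fetch("artificial intelligence")      # served from the cache
fresh = crawler.fetch("artificial intelligence", force_update=True)  # cache bypassed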
FFAInobug/main.py
@@ -41,19 +41,23 @@ if __name__ == "__main__":
     import os
     os.makedirs("reports", exist_ok=True)
 
+    # Handle the case where no argument was given
     if len(sys.argv) < 2:
         print("Usage: python pure_main.py '<search keyword>' [force_update]")
         print("Example: python pure_main.py 'artificial intelligence' true")
-        sys.exit(1)
+        # sys.exit(1)
+
+        query = input("Enter the keyword to search for: ")  # switched to interactive input
+        force_update = input("Force an update (true/false)? ").lower() == "true"
+    else:
+        query = sys.argv[1]
+        force_update = len(sys.argv) > 2 and sys.argv[2].lower() == "true"
 
     hunter = PureInfoHunter()
 
-    # if force_update:
-    #     print("Force-update mode (cache ignored)")
-    #     data = hunter.crawler.fetch(sys.argv[1], force_update=True)
-    #     result = hunter.analyzer.analyze(data, sys.argv[1])
-    # else:
-    #     result = hunter.run(sys.argv[1])
+    if force_update:
+        print("Force-update mode (cache ignored)")
+        data = hunter.crawler.fetch(query, force_update=True)
+        result = hunter.analyzer.analyze(data, query)
+    else:
+        result = hunter.run(query)
 
-    # print(result)
+    print(result)
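With this change the entry point can be exercised both ways; the invocations below follow the usage text printed above (the script in this diff is FFAInobug/main.py, while its own usage string says pure_main.py):

python main.py "artificial intelligence" true   # argv path, forced update (cache ignored)
python main.py "artificial intelligence"        # argv path, cache honored
python main.py                                  # no arguments: falls back to the interactive prompts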