modified:   FFAIall/main.py
modified:   FFAInobug/__pycache__/crawlers.cpython-313.pyc
modified:   FFAInobug/crawlers.py
modified:   FFAInobug/main.py
This commit is contained in:
Friendfeng 2025-06-07 01:51:02 +08:00
parent ac9d058e6a
commit b04a959a27
4 changed files with 72 additions and 28 deletions

View File: FFAIall/main.py

@@ -1,5 +1,7 @@
 from crawlers import PureHTMLParser # type: ignore
 from analyzer import PureAnalyzer # type: ignore
+import sys
+import os
 
 class PureInfoHunter:
     def __init__(self):
@@ -37,8 +39,6 @@ class PureInfoHunter:
         print(f"Report saved to reports/{safe_query}_report.txt")
 
 if __name__ == "__main__":
-    import sys
-    import os
     os.makedirs("reports", exist_ok=True)
     if len(sys.argv) < 2:

View File: FFAInobug/crawlers.py

@@ -3,8 +3,12 @@ import os
 import time
 from urllib.parse import quote
 from html.parser import HTMLParser
+import requests # type: ignore
+from bs4 import BeautifulSoup # type: ignore
+from urllib.parse import quote_plus
 
 class PureHTMLParser(HTMLParser):
     # ...keep the earlier HTML parser code unchanged...
 
 class PureCrawler:
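Note: requests and BeautifulSoup are third-party dependencies (hence the # type: ignore markers); on PyPI the latter is published as beautifulsoup4, so both install with pip install requests beautifulsoup4.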
@@ -14,6 +18,14 @@ class PureHTMLParser(HTMLParser):
         self.cache_dir = cache_dir
         os.makedirs(cache_dir, exist_ok=True)
 
+    def _is_cache_valid(self, cache_file):
+        """Check whether the cache file is still fresh"""
+        if not os.path.exists(cache_file):
+            return False
+        file_time = os.path.getmtime(cache_file)
+        return (time.time() - file_time) < self.cache_expiry
+
     def _get_cache_path(self, query: str) -> str:
         """Build the cache file name"""
         safe_query = "".join(c if c.isalnum() else "_" for c in query)
@@ -51,21 +63,49 @@
             items.append({"url": url, "text": text})
         return items
 
-    def fetch(self, query: str, force_update=False) -> list:
-        """Read from cache first; crawl only when no cache exists"""
-        if not force_update:
-            cached = self._load_from_cache(query)
-            if cached:
-                print("📂 Loading data from cache")
-                return cached
-        print("🌐 Crawling web data...")
-        # ...(keep the original crawling logic)...
-        data = [{"url": link, "text": self.extract_text(link)} for link in self.parser.links[:5]]
-        self._save_to_cache(query, data)
-        return data
-
-    def extract_text(self, url: str) -> str:
-        # ...(keep the original text-extraction logic)...
-        return extracted_text
+    def fetch(self, query, force_update=False):
+        cache_file = os.path.join(self.cache_dir, f"{quote_plus(query)}.json")
+
+        # Check whether the cache is valid
+        if not force_update and self._is_cache_valid(cache_file):
+            return self._load_from_cache(cache_file)
+
+        try:
+            # Actual crawling logic - Baidu search as an example
+            search_url = f"https://www.baidu.com/s?wd={quote_plus(query)}"
+            response = requests.get(search_url, headers=self.headers, timeout=10)
+            response.raise_for_status()
+
+            # Parse the page content
+            soup = BeautifulSoup(response.text, 'html.parser')
+            results = []
+
+            # Extract search results - adjust to the actual site structure
+            for item in soup.select('.result.c-container'):
+                title = item.select_one('h3').get_text(strip=True)
+                link = item.find('a')['href']
+                abstract = item.select_one('.c-abstract').get_text(strip=True) if item.select_one('.c-abstract') else ""
+                results.append({
+                    'title': title,
+                    'url': link,
+                    'abstract': abstract
+                })
+
+            data = {
+                'query': query,
+                'results': results,
+                'timestamp': int(time.time()),
+                'sources': [search_url]
+            }
+
+            # Save to cache
+            self._save_to_cache(cache_file, data)
+            return data
+        except Exception as e:
+            # If the crawl fails but a cache file exists, fall back to it
+            if os.path.exists(cache_file):
+                print(f"Crawl failed, using cached data: {str(e)}")
+                return self._load_from_cache(cache_file)
+            raise RuntimeError(f"Crawl failed and no cache is available: {str(e)}")
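The rewritten fetch calls _save_to_cache(cache_file, data) and _load_from_cache(cache_file), but neither helper appears in this diff. A minimal JSON-based sketch that is consistent with those call sites (the names and signatures come from the code above; the bodies are assumptions):

import json

def _save_to_cache(self, cache_file, data):
    # Assumed: serialize the result dict as UTF-8 JSON.
    with open(cache_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False)

def _load_from_cache(self, cache_file):
    # Assumed: read back what _save_to_cache wrote; callers have
    # already checked freshness via _is_cache_valid().
    with open(cache_file, "r", encoding="utf-8") as f:
        return json.load(f)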

View File: FFAInobug/main.py

@@ -41,19 +41,23 @@ if __name__ == "__main__":
     import os
     os.makedirs("reports", exist_ok=True)
 
+    # Handle the missing-argument case
     if len(sys.argv) < 2:
         print("Usage: python pure_main.py '<search keyword>' [force_update]")
         print("Example: python pure_main.py 'artificial intelligence' true")
-        # sys.exit(1)
-
-    force_update = len(sys.argv) > 2 and sys.argv[2].lower() == "true"
+        query = input("Enter a search keyword: ")  # switched to interactive input
+        force_update = input("Force update (true/false)? ").lower() == "true"
+    else:
+        query = sys.argv[1]
+        force_update = len(sys.argv) > 2 and sys.argv[2].lower() == "true"
 
     hunter = PureInfoHunter()
-    # if force_update:
-    #     print("Force-update mode (cache ignored)")
-    #     data = hunter.crawler.fetch(sys.argv[1], force_update=True)
-    #     result = hunter.analyzer.analyze(data, sys.argv[1])
-    # else:
-    #     result = hunter.run(sys.argv[1])
-    # print(result)
+    if force_update:
+        print("Force-update mode (cache ignored)")
+        data = hunter.crawler.fetch(query, force_update=True)
+        result = hunter.analyzer.analyze(data, query)
+    else:
+        result = hunter.run(query)
+    print(result)
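PureInfoHunter.run is not shown in this diff; judging from the force_update branch above, it presumably chains fetch and analyze with caching left enabled. A minimal sketch under that assumption (the method body is hypothetical):

def run(self, query):
    # Hypothetical: mirrors the force_update branch, except fetch()
    # may serve the result from a still-valid cache file.
    data = self.crawler.fetch(query)
    return self.analyzer.analyze(data, query)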