modified: FFAIall/main.py
modified: FFAInobug/__pycache__/crawlers.cpython-313.pyc
modified: FFAInobug/crawlers.py
modified: FFAInobug/main.py
parent ac9d058e6a
commit b04a959a27
FFAIall/main.py
@@ -1,5 +1,7 @@
 from crawlers import PureHTMLParser # type: ignore
 from analyzer import PureAnalyzer # type: ignore
+import sys
+import os
 
 class PureInfoHunter:
     def __init__(self):
@@ -37,8 +39,6 @@ class PureInfoHunter:
         print(f"报告已保存到 reports/{safe_query}_report.txt")
 
 if __name__ == "__main__":
-    import sys
-    import os
     os.makedirs("reports", exist_ok=True)
 
     if len(sys.argv) < 2:
FFAInobug/__pycache__/crawlers.cpython-313.pyc
Binary file not shown.
FFAInobug/crawlers.py
@@ -3,8 +3,12 @@ import os
 import time
 from urllib.parse import quote
 from html.parser import HTMLParser
+import requests # type: ignore
+from bs4 import BeautifulSoup # type: ignore
+from urllib.parse import quote_plus
+
 
 class PureHTMLParser(HTMLParser):
     # ... (keep the previous HTML parser code unchanged) ...
 
 class PureCrawler:
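The two new third-party imports assume the requests and beautifulsoup4 packages are available in the environment, e.g. installed with:

    pip install requests beautifulsoup4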
@@ -14,6 +18,14 @@ class PureHTMLParser(HTMLParser):
         self.cache_dir = cache_dir
         os.makedirs(cache_dir, exist_ok=True)
 
+    def _is_cache_valid(self, cache_file):
+        """Check whether the cache is still valid."""
+        if not os.path.exists(cache_file):
+            return False
+
+        file_time = os.path.getmtime(cache_file)
+        return (time.time() - file_time) < self.cache_expiry
+
     def _get_cache_path(self, query: str) -> str:
         """Generate the cache file name."""
         safe_query = "".join(c if c.isalnum() else "_" for c in query)
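The new fetch() path in the next hunk relies on _load_from_cache and _save_to_cache, which this diff does not show. A minimal sketch of how such helpers could pair with _is_cache_valid, assuming one JSON file per query (the class name and JSON layout here are illustrative assumptions, not the repository's actual code):

    import json
    import os
    import time

    class CacheHelpersSketch:
        """Hypothetical stand-ins for PureCrawler's cache helpers."""

        def __init__(self, cache_dir="cache", cache_expiry=3600):
            self.cache_dir = cache_dir
            self.cache_expiry = cache_expiry  # seconds before a cache file counts as stale
            os.makedirs(cache_dir, exist_ok=True)

        def _is_cache_valid(self, cache_file):
            # Same rule as the diff: a missing or expired file is invalid.
            if not os.path.exists(cache_file):
                return False
            return (time.time() - os.path.getmtime(cache_file)) < self.cache_expiry

        def _save_to_cache(self, cache_file, data):
            # Assumed format: one UTF-8 JSON document per cached query.
            with open(cache_file, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)

        def _load_from_cache(self, cache_file):
            with open(cache_file, "r", encoding="utf-8") as f:
                return json.load(f)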
@@ -51,21 +63,49 @@ class PureHTMLParser(HTMLParser):
             items.append({"url": url, "text": text})
         return items
 
-    def fetch(self, query: str, force_update=False) -> list:
-        """Read from the cache first; crawl when no cache exists."""
-        if not force_update:
-            cached = self._load_from_cache(query)
-            if cached:
-                print("📂 从缓存加载数据")
-                return cached
-
-        print("🌐 正在爬取网络数据...")
-        # ... (keep the original crawling logic) ...
-        data = [{"url": link, "text": self.extract_text(link)} for link in self.parser.links[:5]]
-
-        self._save_to_cache(query, data)
-        return data
-
-    def extract_text(self, url: str) -> str:
-        # ... (keep the original body-text extraction logic) ...
-        return extracted_text
+    def fetch(self, query, force_update=False):
+        cache_file = os.path.join(self.cache_dir, f"{quote_plus(query)}.json")
+
+        # Check whether the cache is valid
+        if not force_update and self._is_cache_valid(cache_file):
+            return self._load_from_cache(cache_file)
+
+        try:
+            # Actual scraping logic - using Baidu search as an example
+            search_url = f"https://www.baidu.com/s?wd={quote_plus(query)}"
+            response = requests.get(search_url, headers=self.headers, timeout=10)
+            response.raise_for_status()
+
+            # Parse the page content
+            soup = BeautifulSoup(response.text, 'html.parser')
+            results = []
+
+            # Extract search results - adjust to the actual site structure
+            for item in soup.select('.result.c-container'):
+                title = item.select_one('h3').get_text(strip=True)
+                link = item.find('a')['href']
+                abstract = item.select_one('.c-abstract').get_text(strip=True) if item.select_one('.c-abstract') else ""
+                results.append({
+                    'title': title,
+                    'url': link,
+                    'abstract': abstract
+                })
+
+            data = {
+                'query': query,
+                'results': results,
+                'timestamp': int(time.time()),
+                'sources': [search_url]
+            }
+
+            # Save to cache
+            self._save_to_cache(cache_file, data)
+            return data
+
+        except Exception as e:
+            # If scraping fails but a cache exists, fall back to it
+            if os.path.exists(cache_file):
+                print(f"抓取失败,使用缓存数据: {str(e)}")
+                return self._load_from_cache(cache_file)
+            raise RuntimeError(f"抓取失败且无缓存可用: {str(e)}")
+
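The extraction loop above is tied to Baidu's current markup (.result.c-container, h3, .c-abstract), which can change without notice. A self-contained sketch of the same selector logic run against a static HTML snippet (the snippet is invented for illustration), handy for checking the parsing without any network access:

    from bs4 import BeautifulSoup  # type: ignore

    # Invented markup imitating a single Baidu result block; real pages differ.
    sample_html = """
    <div class="result c-container">
      <h3><a href="https://example.com/ai">人工智能示例结果</a></h3>
      <div class="c-abstract">A short abstract about the result.</div>
    </div>
    """

    soup = BeautifulSoup(sample_html, "html.parser")
    results = []
    for item in soup.select(".result.c-container"):
        title = item.select_one("h3").get_text(strip=True)
        link = item.find("a")["href"]
        abstract_tag = item.select_one(".c-abstract")
        abstract = abstract_tag.get_text(strip=True) if abstract_tag else ""
        results.append({"title": title, "url": link, "abstract": abstract})

    print(results)
    # [{'title': '人工智能示例结果', 'url': 'https://example.com/ai', 'abstract': 'A short abstract about the result.'}]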
FFAInobug/main.py
@@ -41,19 +41,23 @@ if __name__ == "__main__":
     import os
     os.makedirs("reports", exist_ok=True)
 
+    # Handle the case where no arguments were given
     if len(sys.argv) < 2:
         print("使用方法: python pure_main.py '搜索关键词' [force_update]")
         print("示例: python pure_main.py '人工智能' true")
-        # sys.exit(1)
+        query = input("请输入要搜索的关键词: ")  # switched to interactive input
+        force_update = input("是否强制更新(true/false)? ").lower() == "true"
+    else:
+        query = sys.argv[1]
+        force_update = len(sys.argv) > 2 and sys.argv[2].lower() == "true"
 
-    force_update = len(sys.argv) > 2 and sys.argv[2].lower() == "true"
     hunter = PureInfoHunter()
 
-    # if force_update:
-    #     print("强制更新模式(忽略缓存)")
-    #     data = hunter.crawler.fetch(sys.argv[1], force_update=True)
-    #     result = hunter.analyzer.analyze(data, sys.argv[1])
-    # else:
-    #     result = hunter.run(sys.argv[1])
+    if force_update:
+        print("强制更新模式(忽略缓存)")
+        data = hunter.crawler.fetch(query, force_update=True)
+        result = hunter.analyzer.analyze(data, query)
+    else:
+        result = hunter.run(query)
 
-    # print(result)
+    print(result)
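With the argument handling above, the script can be driven from the command line or, when no arguments are given, interactively; a sketch of both invocations, assuming the entry point is still the pure_main.py named in the usage string:

    # Command-line mode: the query is argv[1]; an optional second argument "true" forces a re-crawl
    python pure_main.py '人工智能' true

    # No arguments: the usage text is printed and the script falls back to prompting
    python pure_main.py
    使用方法: python pure_main.py '搜索关键词' [force_update]
    示例: python pure_main.py '人工智能' true
    请输入要搜索的关键词: 人工智能
    是否强制更新(true/false)? false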