modified: FFAInobug/crawlers.py

modified: FFAInobug/main.py
Friendfeng 2025-06-07 02:41:28 +08:00
parent b04a959a27
commit b1652bc1c0
2 changed files with 28 additions and 20 deletions

FFAInobug/crawlers.py

@@ -10,11 +10,9 @@ from urllib.parse import quote_plus
 class PureHTMLParser(HTMLParser):
     # ...the previous HTML parser code stays unchanged...
 class PureCrawler:
     def __init__(self, cache_dir="cache"):
         self.user_agent = "Mozilla/5.0"
-        self.parser = PureHTMLParser()
+        # self.parser = PureHTMLParser()
         self.cache_dir = cache_dir
         os.makedirs(cache_dir, exist_ok=True)
@@ -35,8 +33,8 @@ class PureHTMLParser(HTMLParser):
         """Save search results to the cache"""
         with open(self._get_cache_path(query), "w", encoding="utf-8") as f:
             for item in data:
-                f.write(f"URL: {item['url']}\n")
-                f.write(f"Text: {item['text']}\n")
+                f.write(f"URL: {item.get('url', '')}\n")
+                f.write(f"Text: {item.get('abstract', item.get('text', ''))}\n")  # prefer the abstract, fall back to plain text
                 f.write("="*50 + "\n")
     def _load_from_cache(self, query: str) -> list:
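The body of _load_from_cache is not part of this diff. A minimal sketch of what the load side of the plain-text format written by _save_to_cache above might look like (the "URL:"/"Text:" field names and the "="*50 separator are taken from the save side; everything else is assumed):

    def _load_from_cache(self, query: str) -> list:
        """Parse records written by _save_to_cache back into dicts (sketch)."""
        items, record = [], {}
        with open(self._get_cache_path(query), encoding="utf-8") as f:
            for line in f:
                line = line.rstrip("\n")
                if line.startswith("URL: "):
                    record["url"] = line[len("URL: "):]
                elif line.startswith("Text: "):
                    record["text"] = line[len("Text: "):]
                elif line.startswith("=" * 50) and record:
                    # The separator line closes one record
                    items.append(record)
                    record = {}
        return items

Note that fetch() below calls this method with a cache_file path rather than a raw query, so the real implementation may resolve the path differently.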
@@ -64,11 +62,16 @@ class PureHTMLParser(HTMLParser):
         return items
     def fetch(self, query, force_update=False):
+        # Make sure default headers exist
+        if not hasattr(self, 'headers'):
+            self.headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            }
         cache_file = os.path.join(self.cache_dir, f"{quote_plus(query)}.json")
         # Check whether the cache is valid
-        if not force_update and self._is_cache_valid(cache_file):
+        if not force_update and os.path.exists(cache_file) and self._is_cache_valid(cache_file):
             return self._load_from_cache(cache_file)
         try:
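_is_cache_valid is called above but not shown in this diff. A plausible TTL-style sketch (the 3600-second lifetime and the mtime-based test are assumptions):

    import os, time

    def _is_cache_valid(self, cache_file: str, max_age: int = 3600) -> bool:
        """Treat the cache as valid if the file exists and is younger than max_age seconds."""
        if not os.path.exists(cache_file):
            return False
        return (time.time() - os.path.getmtime(cache_file)) < max_age

With the os.path.exists() guard now added at the call site, an existence check inside the method is redundant but harmless.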
@@ -79,29 +82,34 @@ class PureHTMLParser(HTMLParser):
             # Parse the page content
             soup = BeautifulSoup(response.text, 'html.parser')
-            results = [(query)]
-            # Extract search results - adjust to the actual site structure
+            results = []
+            # Extract search results - the actual selectors for Baidu results may need adjusting
             for item in soup.select('.result.c-container'):
-                title = item.select_one('h3').get_text(strip=True)
-                link = item.find('a')['href']
-                abstract = item.select_one('.c-abstract').get_text(strip=True) if item.select_one('.c-abstract') else ""
-                results.append({
-                    'title': title,
-                    'url': link,
-                    'abstract': abstract
-                })
+                title_elem = item.select_one('h3')
+                link_elem = item.find('a')
+                abstract_elem = item.select_one('.c-abstract')
+                if title_elem and link_elem:  # skip hits missing a title or link
+                    results.append({
+                        'title': title_elem.get_text(strip=True),
+                        'url': link_elem.get('href'),
+                        'abstract': abstract_elem.get_text(strip=True) if abstract_elem else ""
+                    })
             data = {
                 'query': query,
-                'results': results,
+                'results': results if results else [{'title': 'no results', 'url': '', 'abstract': ''}],
                 'timestamp': int(time.time()),
                 'sources': [search_url]
             }
             # Save to the cache
             self._save_to_cache(cache_file, data)
-            return data
+            return {
+                'data': data,
+                'sources': ["www.baidu.com"]
+            }
         except Exception as e:
             # If the fetch fails but a cache exists, use the cache
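The return shape changes in this hunk: a fresh fetch now returns a wrapper dict {'data': ..., 'sources': [...]}, while a cache hit earlier in fetch still returns the cached value directly. A caller sketch that tolerates both shapes, assuming PureCrawler as defined above:

    crawler = PureCrawler(cache_dir="cache")
    result = crawler.fetch("python tutorial")
    if isinstance(result, dict) and 'data' in result:
        # Fresh fetch: unwrap the new {'data': ..., 'sources': [...]} shape
        for hit in result['data']['results']:
            print(hit['title'], hit['url'])
        print("sources:", result['sources'])
    else:
        # Cache hit: whatever _load_from_cache returned
        print(result)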

FFAInobug/main.py

@@ -55,7 +55,7 @@ if __name__ == "__main__":
     if force_update:
         print("Force-update mode (ignoring the cache)")
-        data = hunter.crawler.fetch(query, force_update=True)
+        data = hunter.crawler.fetch(query)  # use the method name that actually exists
         result = hunter.analyzer.analyze(data, query)
     else:
         result = hunter.run(query)
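Since fetch() in crawlers.py still accepts force_update (see its signature above), a caller that wants to keep the cache-bypass behavior could wire it to a flag instead of dropping the kwarg. A hypothetical sketch with argparse (the hunter object is assumed from the surrounding script):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("query")
    parser.add_argument("--force", action="store_true", help="ignore the cache")
    args = parser.parse_args()

    data = hunter.crawler.fetch(args.query, force_update=args.force)
    result = hunter.analyzer.analyze(data, args.query)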