# FFAInobug/crawlers.py

import os
import time
from html.parser import HTMLParser
from urllib.parse import quote_plus

import requests  # type: ignore
from bs4 import BeautifulSoup  # type: ignore

class PureHTMLParser(HTMLParser):
    # ...previous HTML parser code unchanged (elided here in the original)...
    pass

class PureCrawler:
    def __init__(self, cache_dir="cache", cache_expiry=3600):
        self.user_agent = "Mozilla/5.0"
        # fetch() passes self.headers to requests, so build it from the user agent
        self.headers = {"User-Agent": self.user_agent}
        self.parser = PureHTMLParser()
        self.cache_dir = cache_dir
        # Cache lifetime in seconds; _is_cache_valid() reads this attribute,
        # but the original never set it (a one-hour default is assumed here)
        self.cache_expiry = cache_expiry
        os.makedirs(cache_dir, exist_ok=True)

    def _is_cache_valid(self, cache_file):
        """Check whether the cache file exists and has not expired."""
        if not os.path.exists(cache_file):
            return False
        file_time = os.path.getmtime(cache_file)
        return (time.time() - file_time) < self.cache_expiry

    def _get_cache_path(self, query: str) -> str:
        """Build a filesystem-safe cache file name for a query."""
        safe_query = "".join(c if c.isalnum() else "_" for c in query)
        return os.path.join(self.cache_dir, f"{safe_query}.txt")

    def _save_to_cache(self, query: str, data: list):
        """Save search results to the cache as plain-text blocks."""
        with open(self._get_cache_path(query), "w", encoding="utf-8") as f:
            for item in data:
                f.write(f"URL: {item['url']}\n")
                f.write(f"Text: {item['text']}\n")
                f.write("=" * 50 + "\n")
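
    # For reference, each cached file produced by _save_to_cache consists of
    # blocks like the following (the values here are illustrative, not taken
    # from the original file):
    #
    #   URL: https://example.com/page
    #   Text: Example result snippet
    #   ==================================================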

    def _load_from_cache(self, query: str) -> list:
        """Load previously cached results for a query, or None if absent."""
        cache_file = self._get_cache_path(query)
        if not os.path.exists(cache_file):
            return None
        with open(cache_file, "r", encoding="utf-8") as f:
            content = f.read()
        # Parse the cache file back into result dicts
        items = []
        for block in content.split("=" * 50):
            if not block.strip():
                continue
            url = text = ""
            for line in block.split("\n"):
                if line.startswith("URL: "):
                    url = line[5:]
                elif line.startswith("Text: "):
                    text = line[6:]
            if url:
                items.append({"url": url, "text": text})
        return items

    def fetch(self, query, force_update=False):
        """Fetch search results for a query, serving from the cache when fresh."""
        # Use the same cache path as the save/load helpers (the original built
        # a separate .json path here, so cached data could never be found)
        cache_file = self._get_cache_path(query)
        # Check whether the cache is still valid
        if not force_update and self._is_cache_valid(cache_file):
            return self._load_from_cache(query)
        try:
            # Actual crawling logic -- Baidu search as an example
            search_url = f"https://www.baidu.com/s?wd={quote_plus(query)}"
            response = requests.get(search_url, headers=self.headers, timeout=10)
            response.raise_for_status()
            # Parse the page content
            soup = BeautifulSoup(response.text, "html.parser")
            results = []
            # Extract search results -- adjust the selectors to the actual
            # structure of the target site
            for item in soup.select(".result.c-container"):
                title = item.select_one("h3").get_text(strip=True)
                link = item.find("a")["href"]
                abstract_tag = item.select_one(".c-abstract")
                abstract = abstract_tag.get_text(strip=True) if abstract_tag else ""
                # Normalize to the {url, text} schema the cache helpers expect
                results.append({"url": link, "text": f"{title} {abstract}".strip()})
            # Save to cache
            self._save_to_cache(query, results)
            return results
        except Exception as e:
            # If crawling fails but a cache file exists, fall back to it
            if os.path.exists(cache_file):
                print(f"Crawl failed, using cached data: {e}")
                return self._load_from_cache(query)
            raise RuntimeError(f"Crawl failed and no cache is available: {e}")
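

# A minimal usage sketch (not part of the original file): it assumes network
# access and that Baidu still serves the .result.c-container markup targeted
# by fetch()'s selectors; once primed, cached queries also work offline.
if __name__ == "__main__":
    crawler = PureCrawler(cache_dir="cache", cache_expiry=3600)
    try:
        for hit in crawler.fetch("python web crawler"):
            print(hit["url"], "-", hit["text"][:60])
    except RuntimeError as err:
        print(err)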