import os
import time
import urllib.request
from html.parser import HTMLParser
from urllib.parse import quote, quote_plus

import requests  # type: ignore
from bs4 import BeautifulSoup  # type: ignore

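# Overview: PureHTMLParser parses raw HTML with the standard-library HTMLParser,
# while PureCrawler fetches search results (Baidu is used as the example) and
# caches them on disk under cache_dir.
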
class PureHTMLParser(HTMLParser):
    """...(the HTML parser implementation from earlier is kept unchanged; omitted here)..."""
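    # The real parser body is elided above; as a rough illustration only (an
    # assumption, not the original implementation), a minimal version could
    # collect outgoing links and their anchor text while HTML is fed in:
    def __init__(self):
        super().__init__()
        self.results = []          # collected {"url": ..., "text": ...} items
        self._current_href = None  # href of the <a> tag currently open

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            self._current_href = dict(attrs).get("href")

    def handle_data(self, data):
        if self._current_href and data.strip():
            self.results.append({"url": self._current_href, "text": data.strip()})

    def handle_endtag(self, tag):
        if tag == "a":
            self._current_href = None
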
class PureCrawler:
    def __init__(self, cache_dir="cache"):
        self.user_agent = "Mozilla/5.0"
        # fetch() passes these headers to requests.get(); the original set only user_agent
        self.headers = {"User-Agent": self.user_agent}
        self.parser = PureHTMLParser()
        self.cache_dir = cache_dir
        # Cache lifetime in seconds; never set in the original, so one day is an assumed default
        self.cache_expiry = 24 * 60 * 60
        os.makedirs(cache_dir, exist_ok=True)

    def _is_cache_valid(self, cache_file):
        """Check whether the cache file exists and has not expired yet."""
        if not os.path.exists(cache_file):
            return False

        file_time = os.path.getmtime(cache_file)
        return (time.time() - file_time) < self.cache_expiry

    def _get_cache_path(self, query: str) -> str:
        """Build the cache file name for a query, e.g. "machine learning" -> cache/machine_learning.txt."""
        safe_query = "".join(c if c.isalnum() else "_" for c in query)
        return f"{self.cache_dir}/{safe_query}.txt"

    def _save_to_cache(self, query: str, data: dict):
        """Write the search results to this query's cache file."""
        with open(self._get_cache_path(query), "w", encoding="utf-8") as f:
            f.write(f"Query: {data['query']}\n")
            f.write(f"Timestamp: {data['timestamp']}\n")
            f.write("=" * 50 + "\n")
            for item in data["results"]:
                f.write(f"Title: {item['title']}\n")
                f.write(f"URL: {item['url']}\n")
                f.write(f"Abstract: {item['abstract']}\n")
                f.write("=" * 50 + "\n")

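    # For reference, a cache file written by _save_to_cache looks roughly like this
    # (illustrative values only):
    #
    #   Query: machine learning
    #   Timestamp: 1749300000
    #   ==================================================
    #   Title: Machine learning - Wikipedia
    #   URL: https://en.wikipedia.org/wiki/Machine_learning
    #   Abstract: Machine learning is a field of study in artificial intelligence ...
    #   ==================================================
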
    def _load_from_cache(self, query: str) -> dict:
        """Load previously cached results for a query."""
        cache_file = self._get_cache_path(query)
        if not os.path.exists(cache_file):
            return None

        with open(cache_file, "r", encoding="utf-8") as f:
            content = f.read()

        # Parse the cache file back into the same structure that fetch() returns
        data = {"query": query, "results": [], "timestamp": 0, "sources": []}
        for block in content.split("=" * 50):
            if not block.strip():
                continue
            item = {"title": "", "url": "", "abstract": ""}
            for line in block.split("\n"):
                if line.startswith("Query: "):
                    data["query"] = line[len("Query: "):]
                elif line.startswith("Timestamp: "):
                    data["timestamp"] = int(line[len("Timestamp: "):])
                elif line.startswith("Title: "):
                    item["title"] = line[len("Title: "):]
                elif line.startswith("URL: "):
                    item["url"] = line[len("URL: "):]
                elif line.startswith("Abstract: "):
                    item["abstract"] = line[len("Abstract: "):]
            if item["url"]:
                data["results"].append(item)
        return data

    def fetch(self, query, force_update=False):
        """Fetch search results for a query, preferring the local cache while it is still fresh."""
        cache_file = self._get_cache_path(query)

        # Use the cache when it is valid and a refresh was not requested
        if not force_update and self._is_cache_valid(cache_file):
            return self._load_from_cache(query)

        try:
            # Actual scraping logic - Baidu search is used as the example
            search_url = f"https://www.baidu.com/s?wd={quote_plus(query)}"
            response = requests.get(search_url, headers=self.headers, timeout=10)
            response.raise_for_status()

            # Parse the page content
            soup = BeautifulSoup(response.text, 'html.parser')
            results = []

            # Extract the search results - adjust the selectors to the actual site structure
            for item in soup.select('.result.c-container'):
                title_tag = item.select_one('h3')
                link_tag = item.find('a')
                if not title_tag or not link_tag:
                    continue
                title = title_tag.get_text(strip=True)
                link = link_tag.get('href', '')
                abstract_tag = item.select_one('.c-abstract')
                abstract = abstract_tag.get_text(strip=True) if abstract_tag else ""
                results.append({
                    'title': title,
                    'url': link,
                    'abstract': abstract
                })

            data = {
                'query': query,
                'results': results,
                'timestamp': int(time.time()),
                'sources': [search_url]
            }

            # Save to the cache
            self._save_to_cache(query, data)
            return data

        except Exception as e:
            # If scraping fails but a cache file exists, fall back to the cached data
            if os.path.exists(cache_file):
                print(f"Fetch failed, falling back to cached data: {e}")
                return self._load_from_cache(query)
            raise RuntimeError(f"Fetch failed and no cache is available: {e}") from e
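
# A minimal usage sketch (assumes network access and that Baidu's result markup
# still matches the selectors above):
if __name__ == "__main__":
    crawler = PureCrawler(cache_dir="cache")
    data = crawler.fetch("python web crawler")
    print(f"{len(data['results'])} results for {data['query']!r}")
    for result in data["results"][:3]:
        print(result["title"], "->", result["url"])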