import urllib.request
import os
import time
from urllib.parse import quote
from html.parser import HTMLParser

class PureHTMLParser(HTMLParser):
    # ... (HTML parser code from earlier, unchanged) ...
    pass
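
# Purely illustrative stand-in for the elided parser (the class name and body
# are assumptions, not the original code): fetch() below only requires that
# the parser expose a `links` list populated while feed()ing search-result
# HTML, so a minimal version could look like this.
class _ExampleLinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        # Record the href of every <a> tag encountered in the fed HTML.
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)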

class PureCrawler:
    def __init__(self, cache_dir="cache"):
        self.user_agent = "Mozilla/5.0"
        self.parser = PureHTMLParser()
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

    def _get_cache_path(self, query: str) -> str:
        """Build a filesystem-safe cache file path for a query."""
        safe_query = "".join(c if c.isalnum() else "_" for c in query)
        return f"{self.cache_dir}/{safe_query}.txt"
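
    # Example (hypothetical query, not from the original source): the query
    # "python 3.12?" maps to the cache path "cache/python_3_12_.txt".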

    def _save_to_cache(self, query: str, data: list):
        """Write search results to the cache file, one delimited block per result."""
        with open(self._get_cache_path(query), "w", encoding="utf-8") as f:
            for item in data:
                f.write(f"URL: {item['url']}\n")
                f.write(f"Text: {item['text']}\n")
                f.write("=" * 50 + "\n")
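
    # For reference, the cache file written above looks like this, one
    # delimited block per result (URL and text here are placeholders):
    #
    #   URL: https://example.com/page
    #   Text: first line of the extracted text...
    #   ==================================================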

    def _load_from_cache(self, query: str) -> list | None:
        """Load cached results for a query, or None when no cache file exists."""
        cache_file = self._get_cache_path(query)
        if not os.path.exists(cache_file):
            return None

        with open(cache_file, "r", encoding="utf-8") as f:
            content = f.read()

        # Parse the cache file back into result dicts.
        items = []
        for block in content.split("=" * 50):
            if not block.strip():
                continue
            url = text = ""
            for line in block.split("\n"):
                if line.startswith("URL: "):
                    url = line[5:]
                elif line.startswith("Text: "):
                    text = line[6:]
            if url:
                items.append({"url": url, "text": text})
        return items
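
    # Round-trip caveat: _save_to_cache writes item["text"] verbatim, so text
    # that itself contains newlines will not survive a reload; only the lines
    # matching the "URL: " / "Text: " prefix checks above are recognized.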

    def fetch(self, query: str, force_update=False) -> list:
        """Serve results from the cache when possible; crawl otherwise."""
        if not force_update:
            cached = self._load_from_cache(query)
            if cached:
                print("📂 Loading data from cache")
                return cached

        print("🌐 Crawling web data...")
        # ... (original crawl logic, unchanged; it presumably fetches the
        # search page for `query` and feeds it to self.parser, which
        # populates self.parser.links) ...
        data = [{"url": link, "text": self.extract_text(link)}
                for link in self.parser.links[:5]]

        self._save_to_cache(query, data)
        return data

    def extract_text(self, url: str) -> str:
        # ... (original text-extraction logic, unchanged; assumed to define
        # `extracted_text` from the page at `url`) ...
        return extracted_text
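
# Minimal usage sketch (assumes the elided crawl/extraction logic is in place;
# the query string is only an example):
if __name__ == "__main__":
    crawler = PureCrawler(cache_dir="cache")
    results = crawler.fetch("python caching")  # first call crawls and caches
    results = crawler.fetch("python caching")  # second call loads from cache
    for item in results:
        print(item["url"], "->", item["text"][:80])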