import os
import time
import urllib.request
from typing import Optional
from urllib.parse import quote
from html.parser import HTMLParser


class PureHTMLParser(HTMLParser):
    # ... (keep the earlier HTML parser code unchanged) ...
    pass


class PureCrawler:
    def __init__(self, cache_dir="cache"):
        self.user_agent = "Mozilla/5.0"
        self.parser = PureHTMLParser()
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

    def _get_cache_path(self, query: str) -> str:
        """Build a filesystem-safe cache file name for a query."""
        safe_query = "".join(c if c.isalnum() else "_" for c in query)
        return os.path.join(self.cache_dir, f"{safe_query}.txt")

    def _save_to_cache(self, query: str, data: list):
        """Write search results to the cache file."""
        with open(self._get_cache_path(query), "w", encoding="utf-8") as f:
            for item in data:
                f.write(f"URL: {item['url']}\n")
                f.write(f"Text: {item['text']}\n")
                f.write("=" * 50 + "\n")

    def _load_from_cache(self, query: str) -> Optional[list]:
        """Load cached results; return None when no cache file exists."""
        cache_file = self._get_cache_path(query)
        if not os.path.exists(cache_file):
            return None
        with open(cache_file, "r", encoding="utf-8") as f:
            content = f.read()
        # Parse the cache file back into a list of result dicts.
        items = []
        for block in content.split("=" * 50):
            if not block.strip():
                continue
            url = text = ""
            for line in block.split("\n"):
                if line.startswith("URL: "):
                    url = line[5:]
                elif line.startswith("Text: "):
                    text = line[6:]
            if url:
                items.append({"url": url, "text": text})
        return items

    def fetch(self, query: str, force_update=False) -> list:
        """Prefer the cache; crawl the web only when no cache entry exists."""
        if not force_update:
            cached = self._load_from_cache(query)
            if cached:
                print("📂 Loaded results from cache")
                return cached
        print("🌐 Crawling the web...")
        # ... (keep the original crawling logic) ...
        data = [{"url": link, "text": self.extract_text(link)}
                for link in self.parser.links[:5]]
        self._save_to_cache(query, data)
        return data

    def extract_text(self, url: str) -> str:
        # ... (keep the original text-extraction logic) ...
        return extracted_text
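
# A minimal usage sketch, assuming the elided PureHTMLParser populates
# self.parser.links during fetch() and that extract_text() returns page
# text once the original extraction logic is restored. The query string
# "python web scraping" is purely illustrative.
if __name__ == "__main__":
    crawler = PureCrawler(cache_dir="cache")

    # First call hits the network and writes the results into cache/.
    results = crawler.fetch("python web scraping")

    # A repeated call with the same query is served from the cache file;
    # pass force_update=True to re-crawl and overwrite the cached copy.
    results = crawler.fetch("python web scraping")

    for item in results:
        print(item["url"], "->", item["text"][:80])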