FC/FFAIall(暂停)/crawlers.py

import urllib.request
import os
import time
from urllib.parse import quote
from html.parser import HTMLParser
class PureHTMLParser(HTMLParser):
    # ... original HTML parser code elided here (kept unchanged) ...
    # Minimal placeholder so the file remains runnable: fetch() below
    # expects the parser to expose a `links` list.
    def __init__(self):
        super().__init__()
        self.links = []
class PureCrawler:
    def __init__(self, cache_dir="cache"):
        self.user_agent = "Mozilla/5.0"
        self.parser = PureHTMLParser()
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

    def _get_cache_path(self, query: str) -> str:
        """Build the cache file path for a query."""
        safe_query = "".join(c if c.isalnum() else "_" for c in query)
        return f"{self.cache_dir}/{safe_query}.txt"
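    # Illustrative example: the query "machine learning" maps to the
    # cache path "cache/machine_learning.txt" (every non-alphanumeric
    # character, including the space, is replaced by "_").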
    def _save_to_cache(self, query: str, data: list):
        """Save search results to the cache file."""
        with open(self._get_cache_path(query), "w", encoding="utf-8") as f:
            for item in data:
                f.write(f"URL: {item['url']}\n")
                f.write(f"Text: {item['text']}\n")
                f.write("=" * 50 + "\n")
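    # The resulting cache file looks like this (illustrative values):
    #   URL: https://example.com/page
    #   Text: extracted page text ...
    #   ==================================================
    # where the separator is a run of 50 "=" characters, matching the
    # "=" * 50 split marker used by _load_from_cache below.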
    def _load_from_cache(self, query: str) -> list | None:
        """Load cached results; return None when no cache file exists."""
        cache_file = self._get_cache_path(query)
        if not os.path.exists(cache_file):
            return None
        with open(cache_file, "r", encoding="utf-8") as f:
            content = f.read()
        # Parse the cache file back into result dicts
        items = []
        for block in content.split("=" * 50):
            if not block.strip():
                continue
            url = text = ""
            for line in block.split("\n"):
                if line.startswith("URL: "):
                    url = line[5:]
                elif line.startswith("Text: "):
                    text = line[6:]
            if url:
                items.append({"url": url, "text": text})
        return items
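    # Note: _save_to_cache writes item["text"] verbatim, so any newline
    # inside the text splits it across lines; only the portion on the
    # "Text: " line survives a cache round-trip.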
    def fetch(self, query: str, force_update=False) -> list:
        """Read from the cache first; crawl only when nothing is cached."""
        if not force_update:
            cached = self._load_from_cache(query)
            if cached:
                print("📂 Loading data from cache")
                return cached
        print("🌐 Crawling the web...")
        # ... (original crawl logic elided; it is expected to feed the
        # fetched HTML into self.parser, populating self.parser.links) ...
        data = [{"url": link, "text": self.extract_text(link)}
                for link in self.parser.links[:5]]
        self._save_to_cache(query, data)
        return data
    def extract_text(self, url: str) -> str:
        # ... (original body-text extraction logic elided) ...
        extracted_text = ""  # placeholder so the elided method still returns a str
        return extracted_text
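

# A minimal usage sketch (hypothetical): with the elided crawl and
# extraction logic in place, the crawler is driven like this.
if __name__ == "__main__":
    crawler = PureCrawler(cache_dir="cache")
    results = crawler.fetch("python tutorial")  # cache-first lookup
    for item in results:
        print(item["url"], "->", item["text"][:80])
    crawler.fetch("python tutorial", force_update=True)  # bypass the cache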