FC/FFAI/crawlers.py

import os
import time
from html.parser import HTMLParser
from urllib.parse import quote_plus

import requests  # type: ignore
from bs4 import BeautifulSoup  # type: ignore


class PureHTMLParser(HTMLParser):
    def __init__(self, cache_dir="cache", cache_expiry=3600):
        super().__init__()  # initialize the HTMLParser base class
        self.user_agent = "Mozilla/5.0"
        self.cache_dir = cache_dir
        # cache lifetime in seconds, required by _is_cache_valid below
        # (assumed default: one hour)
        self.cache_expiry = cache_expiry
        os.makedirs(cache_dir, exist_ok=True)

    def _is_cache_valid(self, cache_file):
        """Check that a cache file exists and has not expired."""
        if not os.path.exists(cache_file):
            return False
        file_time = os.path.getmtime(cache_file)
        return (time.time() - file_time) < self.cache_expiry

    def _get_cache_path(self, query: str) -> str:
        """Build a filesystem-safe cache file name for a query."""
        safe_query = "".join(c if c.isalnum() else "_" for c in query)
        return f"{self.cache_dir}/{safe_query}.txt"

    def _save_to_cache(self, query: str, data: list):
        """Save search results to the cache (corrected version)."""
        cache_file = self._get_cache_path(query)
        try:
            with open(cache_file, "w", encoding="utf-8") as f:
                for item in data:
                    # Fix: access keys safely in case an item lacks them
                    url = item.get('url', '')
                    text = item.get('text', '')
                    f.write(f"URL: {url}\n")
                    f.write(f"Text: {text}\n")
                    f.write("=" * 50 + "\n")
        except Exception as e:
            print(f"Failed to save cache: {e}")
    def _load_from_cache(self, query: str) -> list | None:
        """Load cached data for a query; return None when no cache exists."""
        cache_file = self._get_cache_path(query)
        if not os.path.exists(cache_file):
            return None
        with open(cache_file, "r", encoding="utf-8") as f:
            content = f.read()
        # Parse the cache file back into a list of records
        items = []
        for block in content.split("=" * 50):
            if not block.strip():
                continue
            url = text = ""
            for line in block.split("\n"):
                if line.startswith("URL: "):
                    url = line[5:]
                elif line.startswith("Text: "):
                    text = line[6:]
            if url:
                items.append({"url": url, "text": text})
        return items

    def fetch(self, query, force_update=False):
        # Ensure default headers exist
        if not hasattr(self, 'headers'):
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/91.0.4472.124 Safari/537.36'
            }
        # Use the same path helper as the cache methods, so the validity
        # check, the load, and the save all refer to one file
        cache_file = self._get_cache_path(query)
        # Serve from cache when it is present and fresh
        if not force_update and self._is_cache_valid(cache_file):
            return self._load_from_cache(query)
        try:
            # The actual crawl
            search_url = f"https://www.baidu.com/s?wd={quote_plus(query)}"
            response = requests.get(search_url, headers=self.headers, timeout=10)
            response.raise_for_status()
            # Parse the page content
            soup = BeautifulSoup(response.text, 'html.parser')
            results = []
            # Extract search results; Baidu's actual selectors may need adjusting
            for item in soup.select('.result.c-container'):
                title_elem = item.select_one('h3')
                link_elem = item.find('a')
                abstract_elem = item.select_one('.c-abstract')
                if title_elem and link_elem:
                    results.append({
                        'title': title_elem.get_text(strip=True),
                        'url': link_elem.get('href'),
                        'abstract': abstract_elem.get_text(strip=True) if abstract_elem else ""
                    })
            data = {
                'query': query,
                'results': results if results else [{'title': 'No results', 'url': '', 'abstract': ''}],
                'timestamp': int(time.time()),
                'sources': [search_url]
            }
            # Save to cache: _save_to_cache expects (query, list of items),
            # and the text format only keeps 'url' and 'text'
            self._save_to_cache(query, [
                {'url': r['url'], 'text': r['title']} for r in data['results']
            ])
            return {
                'data': data,
                'sources': ["www.baidu.com"]
            }
        except Exception as e:
            # If the fetch fails but a cache exists, fall back to it
            if os.path.exists(cache_file):
                print(f"Fetch failed, using cached data: {e}")
                return self._load_from_cache(query)
            raise RuntimeError(f"Fetch failed and no cache is available: {e}")
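

# Minimal usage sketch (an assumed entry point, not part of the original
# module). Note the asymmetric return shape: a fresh fetch returns
# {'data': ..., 'sources': ...}, while a cache hit returns the list of
# {'url', 'text'} records rebuilt by _load_from_cache.
if __name__ == "__main__":
    crawler = PureHTMLParser(cache_dir="cache")
    try:
        fresh = crawler.fetch("artificial intelligence", force_update=True)
        if isinstance(fresh, dict):  # network path succeeded
            for r in fresh['data']['results']:
                print(r['title'], r['url'])
        cached = crawler.fetch("artificial intelligence")  # served from cache
        print(f"{len(cached)} records loaded from cache")
    except RuntimeError as e:
        print(e)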