renamed: FFAI/__pycache__/catch.cpython-313.pyc -> FFAICilent/__pycache__/catch.cpython-313.pyc
renamed: FFAI/__pycache__/crawlers.cpython-313.pyc -> FFAICilent/__pycache__/crawlers.cpython-313.pyc
renamed: FFAI/__pycache__/crawlers_core.cpython-313.pyc -> FFAICilent/__pycache__/crawlers_core.cpython-313.pyc
renamed: FFAI/analyzer.py -> FFAICilent/analyzer.py
renamed: FFAI/catch.py -> FFAICilent/catch.py
new file: FFAICilent/cloud.py
new file: FFAICilent/config/config.ini
new file: FFAICilent/config/configloder.py
renamed: FFAI/crawlers.py -> FFAICilent/crawlers.py
renamed: FFAI/crawlers_core.py -> FFAICilent/crawlers_core.py
new file: FFAICilent/local.py
new file: FFAICilent/logger.py
renamed: FFAI/main.py -> FFAICilent/main.py
new file: FFAICilent/manger.py
128 lines · 5.0 KiB · Python
# from typing import Self
# import urllib.request
# import os
# import time
# from urllib.parse import quote
# from html.parser import HTMLParser
# import requests  # type: ignore
# from bs4 import BeautifulSoup  # type: ignore
# from urllib.parse import quote_plus

# class PureHTMLParser(HTMLParser):

#     def __init__(self, cache_dir="cache"):
#         self.user_agent = "Mozilla/5.0"
#         # self.parser = PureHTMLParser()
#         self.cache_dir = cache_dir
#         os.makedirs(cache_dir, exist_ok=True)

#     def _is_cache_valid(self, cache_file):
#         """Check whether the cached file is still fresh."""
#         if not os.path.exists(cache_file):
#             return False

#         file_time = os.path.getmtime(cache_file)
#         return (time.time() - file_time) < self.cache_expiry

#     def _get_cache_path(self, query: str) -> str:
#         """Build the cache file name for a query."""
#         safe_query = "".join(c if c.isalnum() else "_" for c in query)
#         return f"{self.cache_dir}/{safe_query}.txt"

#     def _save_to_cache(self, query: str, data: list):
#         """Save search results to the cache (corrected version)."""
#         cache_file = self._get_cache_path(query)
#         try:
#             with open(cache_file, "w", encoding="utf-8") as f:
#                 for item in data:
#                     # Fix: make sure item is a dict that contains a 'url' key
#                     url = item.get('url', '')  # safe access
#                     text = item.get('text', '')
#                     f.write(f"URL: {url}\n")
#                     f.write(f"Text: {text}\n")
#                     f.write("="*50 + "\n")
#         except Exception as e:
#             print(f"Failed to save cache: {e}")

#     def _load_from_cache(self, query: str) -> list:
#         """Load data from the cache."""
#         cache_file = self._get_cache_path(query)
#         if not os.path.exists(cache_file):
#             return None

#         with open(cache_file, "r", encoding="utf-8") as f:
#             content = f.read()

#         # Parse the cache file
#         items = []
#         for block in content.split("="*50):
#             if not block.strip():
#                 continue
#             url = text = ""
#             for line in block.split("\n"):
#                 if line.startswith("URL: "):
#                     url = line[5:]
#                 elif line.startswith("Text: "):
#                     text = line[6:]
#             if url:
#                 items.append({"url": url, "text": text})
#         return items

#     def fetch(self, query, force_update=False):
#         # Make sure default headers exist
#         if not hasattr(self, 'headers'):
#             self.headers = {
#                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
#             }

#         cache_file = os.path.join(self.cache_dir, f"{quote_plus(query)}.json")

#         # Check whether the cache is still valid
#         if not force_update and os.path.exists(cache_file) and self._is_cache_valid(cache_file):
#             return self._load_from_cache(cache_file)

#         try:
#             # Do the actual fetch
#             search_url = f"https://www.baidu.com/s?wd={quote_plus(query)}"
#             response = requests.get(search_url, headers=self.headers, timeout=10)
#             response.raise_for_status()

#             # Parse the page content
#             soup = BeautifulSoup(response.text, 'html.parser')
#             results = []

#             # Extract search results - the actual selectors for Baidu results may need adjusting
#             for item in soup.select('.result.c-container'):
#                 title_elem = item.select_one('h3')
#                 link_elem = item.find('a')
#                 abstract_elem = item.select_one('.c-abstract')

#                 if title_elem and link_elem:
#                     results.append({
#                         'title': title_elem.get_text(strip=True),
#                         'url': link_elem.get('href'),
#                         'abstract': abstract_elem.get_text(strip=True) if abstract_elem else ""
#                     })

#             data = {
#                 'query': query,
#                 'results': results if results else [{'title': 'No results', 'url': '', 'abstract': ''}],
#                 'timestamp': int(time.time()),
#                 'sources': [search_url]
#             }

#             # Save to cache
#             self._save_to_cache(cache_file, data)
#             return {
#                 'data': data,
#                 'sources': ["www.baidu.com"]
#             }

#         except Exception as e:
#             # If the fetch fails but a cache exists, fall back to the cache
#             if os.path.exists(cache_file):
#                 print(f"Fetch failed, using cached data: {str(e)}")
#                 return self._load_from_cache(cache_file)
#             raise RuntimeError(f"Fetch failed and no cache is available: {str(e)}")
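
The commented-out class above has a few internal inconsistencies: `_is_cache_valid` reads `self.cache_expiry`, which `__init__` never sets; `_get_cache_path` writes a `.txt` file while `fetch` looks for a `.json` file built with `quote_plus`; and `fetch` passes a file path into `_save_to_cache` and `_load_from_cache`, which expect a query string. Below is a minimal working sketch of the same cache-then-fetch flow with those mismatches resolved. The class name `BaiduFetchCache`, the one-hour default expiry, and the JSON cache format are assumptions for illustration, not part of the original code.

import json
import os
import time
from urllib.parse import quote_plus

import requests  # type: ignore
from bs4 import BeautifulSoup  # type: ignore


class BaiduFetchCache:
    """Hypothetical cleaned-up sketch of the disabled fetcher above."""

    def __init__(self, cache_dir="cache", cache_expiry=3600):
        self.cache_dir = cache_dir
        self.cache_expiry = cache_expiry  # seconds; assumed default, not in the original
        self.headers = {"User-Agent": "Mozilla/5.0"}
        os.makedirs(cache_dir, exist_ok=True)

    def _cache_path(self, query: str) -> str:
        # Single canonical cache path; the original mixed .txt and .json names.
        return os.path.join(self.cache_dir, f"{quote_plus(query)}.json")

    def _is_cache_valid(self, cache_file: str) -> bool:
        return (
            os.path.exists(cache_file)
            and (time.time() - os.path.getmtime(cache_file)) < self.cache_expiry
        )

    def _save_to_cache(self, query: str, data: dict) -> None:
        with open(self._cache_path(query), "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False)

    def _load_from_cache(self, query: str):
        cache_file = self._cache_path(query)
        if not os.path.exists(cache_file):
            return None
        with open(cache_file, "r", encoding="utf-8") as f:
            return json.load(f)

    def fetch(self, query: str, force_update: bool = False) -> dict:
        # Serve from cache when it is fresh and a refresh was not forced.
        if not force_update and self._is_cache_valid(self._cache_path(query)):
            return self._load_from_cache(query)

        try:
            search_url = f"https://www.baidu.com/s?wd={quote_plus(query)}"
            response = requests.get(search_url, headers=self.headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")
            results = []
            # Selectors taken from the original code; Baidu's markup changes,
            # so they may need adjusting.
            for item in soup.select(".result.c-container"):
                title_elem = item.select_one("h3")
                link_elem = item.find("a")
                abstract_elem = item.select_one(".c-abstract")
                if title_elem and link_elem:
                    results.append({
                        "title": title_elem.get_text(strip=True),
                        "url": link_elem.get("href"),
                        "abstract": abstract_elem.get_text(strip=True) if abstract_elem else "",
                    })

            data = {
                "query": query,
                "results": results,
                "timestamp": int(time.time()),
                "sources": [search_url],
            }
            self._save_to_cache(query, data)  # keyed by query, not by file path
            return data

        except Exception as e:
            # Fall back to a stale cache entry if the network fetch fails.
            cached = self._load_from_cache(query)
            if cached is not None:
                print(f"Fetch failed, using cached data: {e}")
                return cached
            raise RuntimeError(f"Fetch failed and no cache is available: {e}")

A call such as `BaiduFetchCache().fetch("python requests")` would hit the network on the first run and read `cache/python+requests.json` on later runs until the entry expires.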