FC/FFAI/crawlers_core.py

import time
import urllib.request
import urllib.robotparser
from urllib.parse import quote, urlparse

from bs4 import BeautifulSoup


class CrawlerEngine:
    def __init__(self, cache_manager):
        self.cache = cache_manager
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        self.delay = 2  # delay between page fetches (seconds)
    def _can_fetch(self, url) -> bool:
        """Check robots.txt permission for the given URL."""
        try:
            parsed = urlparse(url)
            base_url = f"{parsed.scheme}://{parsed.netloc}"
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(f"{base_url}/robots.txt")
            rp.read()
            return rp.can_fetch(self.headers['User-Agent'], url)
        except Exception:
            # If robots.txt cannot be fetched or parsed, assume crawling is allowed
            return True
    def _fetch_html(self, url) -> str:
        """Fetch page content safely, respecting robots.txt."""
        if not self._can_fetch(url):
            raise PermissionError(f"Not allowed to crawl: {url}")
        req = urllib.request.Request(url, headers=self.headers)
        try:
            with urllib.request.urlopen(req, timeout=10) as response:
                if response.status == 200:
                    return response.read().decode('utf-8')
                raise ConnectionError(f"HTTP {response.status}")
        except Exception as e:
            raise ConnectionError(f"Fetch failed: {url} - {e}") from e
    def _extract_content(self, html: str) -> dict:
        """Extract structured data from HTML."""
        soup = BeautifulSoup(html, 'html.parser')
        # Remove tags that carry no useful content
        for tag in ['script', 'style', 'nav', 'footer']:
            for element in soup(tag):
                element.decompose()
        # Extract the core content
        title = (soup.title.string or '') if soup.title else ''
        text = ' '.join(p.get_text() for p in soup.find_all('p'))
        return {
            'title': title.strip(),
            'content': text.strip(),
            'links': [a['href'] for a in soup.find_all('a', href=True)]
        }
    def crawl(self, query: str, max_results=5) -> dict:
        """Run the full crawl pipeline for a query."""
        # Check the cache first
        cached = self.cache.load_from_cache(query)
        if cached:
            print(f"Using cached data: {query}")
            return cached
        print(f"Starting crawl: {query}")
        results = []
        try:
            # Simulate a search-engine query (Baidu is used as an example)
            search_url = f"https://www.baidu.com/s?wd={quote(query)}"
            html = self._fetch_html(search_url)
            data = self._extract_content(html)
            # Limit the number of pages fetched and pause between requests
            for link in data['links'][:max_results]:
                if link.startswith('http'):
                    try:
                        page_html = self._fetch_html(link)
                        page_data = self._extract_content(page_html)
                        results.append({
                            'source_url': link,
                            'title': page_data['title'],
                            'content': page_data['content']
                        })
                        time.sleep(self.delay)
                    except Exception as e:
                        print(f"Sub-page fetch failed: {link} - {e}")
            # Save the results to the cache
            result_data = {'query': query, 'results': results}
            self.cache.save_to_cache(query, result_data)
            return result_data
        except Exception as e:
            print(f"Crawl failed: {e}")
            raise RuntimeError(f"Crawl failed and no usable cache: {e}") from e
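

# Illustrative usage sketch (not part of the original module): CrawlerEngine only
# assumes a cache manager exposing load_from_cache(query) and save_to_cache(query, data),
# as called in crawl() above. DictCacheManager below is a hypothetical in-memory
# stand-in for the project's real cache component, used only to show the call flow.
if __name__ == "__main__":
    class DictCacheManager:
        """Minimal in-memory cache used only for this demo."""

        def __init__(self):
            self._store = {}

        def load_from_cache(self, query):
            # Return None on a cache miss, matching how crawl() tests the result
            return self._store.get(query)

        def save_to_cache(self, query, data):
            self._store[query] = data

    engine = CrawlerEngine(DictCacheManager())
    try:
        data = engine.crawl("人工智能", max_results=2)
        print(f"Fetched {len(data['results'])} pages for query: {data['query']}")
    except RuntimeError as e:
        print(f"Demo crawl did not succeed: {e}")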