modified: FFAI/__pycache__/crawlers_core.cpython-313.pyc

modified:   FFAI/crawlers_core.py
Friendfeng 2025-06-07 10:12:47 +08:00
parent 6b7ae8f26e
commit a7abe9d506
2 changed files with 84 additions and 3 deletions


@@ -3,6 +3,9 @@ import urllib.robotparser
from urllib.parse import urlparse
import time
from bs4 import BeautifulSoup
import random
from urllib.parse import quote
from fake_useragent import UserAgent
class CrawlerEngine:
    def __init__(self, cache_manager):
@@ -11,7 +14,14 @@ class CrawlerEngine:
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
-        self.delay = 2  # crawl delay (seconds)
+        self.ua = UserAgent()
        self.search_engines = [
            "https://www.baidu.com/s?wd={}",
            "https://www.sogou.com/web?query={}",
            "https://cn.bing.com/search?q={}"
        ]
        self.delay_range = (2, 5)  # random delay range in seconds

    def _can_fetch(self, url) -> bool:
        """Check robots.txt permissions"""
@@ -24,7 +34,78 @@ class CrawlerEngine:
            return rp.can_fetch(self.headers['User-Agent'], url)
        except:
            return True
    def _get_random_header(self):
        """Build request headers with a randomly chosen User-Agent."""
        return {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Referer': 'https://www.google.com/'
        }

    def _smart_delay(self):
        """Sleep for a random interval within self.delay_range."""
        time.sleep(random.uniform(*self.delay_range))

    def _bypass_anti_spider(self, url):
        """Fetch a page while trying to sidestep basic anti-crawler checks."""
        try:
            req = urllib.request.Request(
                url,
                headers=self._get_random_header(),
                method='GET'
            )
            # Optional proxy support:
            # proxy = random.choice(proxies)
            # req.set_proxy(proxy, 'http')
            with urllib.request.urlopen(req, timeout=15) as response:
                if response.status == 200:
                    return response.read().decode('utf-8', errors='ignore')
                return None
        except Exception:
            return None

    def crawl(self, query, max_retries=3):
        """Enhanced crawl: check the cache first, then rotate search engines with retries."""
        cached = self.cache.load_from_cache(query)
        if cached:
            return cached

        for attempt in range(max_retries):
            try:
                search_url = random.choice(self.search_engines).format(quote(query))
                print(f"Crawling: {search_url} (attempt {attempt + 1})")

                html = self._bypass_anti_spider(search_url)
                self._smart_delay()

                if html:
                    data = self._extract_data(html, query)
                    self.cache.save_to_cache(query, data)
                    return data
            except Exception as e:
                print(f"Attempt failed: {str(e)}")
                if attempt == max_retries - 1:
                    if cached:
                        return cached
                    raise RuntimeError(f"Crawl failed and no cache is available: {str(e)}")

    def _extract_data(self, html, query):
        """Extract search results from the HTML with BeautifulSoup."""
        soup = BeautifulSoup(html, 'html.parser')

        # Parsing logic that covers the different search engines
        results = []
        for item in soup.select('.result, .res, .b_algo')[:10]:  # generic selectors
            title = item.find('h3')
            link = item.find('a', href=True)
            if title and link:
                snippet = item.find('p')
                results.append({
                    'title': title.get_text(strip=True),
                    'url': link['href'],
                    'snippet': snippet.get_text(strip=True)[:200] if snippet else ''
                })

        return {'query': query, 'results': results}

    def _fetch_html(self, url) -> str:
        """Safely fetch page content"""
        if not self._can_fetch(url):
@@ -71,7 +152,7 @@ class CrawlerEngine:
        try:
            # Simulate a search engine query (example uses Baidu)
search_url = f"https://www.baidu.com/s?wd={urllib.parse.quote(query)}" search_url = f"https://www.bing.com/search?q={urllib.parse.quote(query)}"
            html = self._fetch_html(search_url)
            data = self._extract_content(html)
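For context, a minimal usage sketch of the new crawl() entry point. DictCache below is a hypothetical stand-in for the project's real cache manager and only implements the two methods the code above calls (load_from_cache and save_to_cache); the import path assumes FFAI is importable as a package:

    class DictCache:
        # In-memory stand-in for the cache manager (hypothetical).
        def __init__(self):
            self._store = {}

        def load_from_cache(self, query):
            return self._store.get(query)

        def save_to_cache(self, query, data):
            self._store[query] = data

    if __name__ == '__main__':
        from FFAI.crawlers_core import CrawlerEngine

        engine = CrawlerEngine(DictCache())
        data = engine.crawl('python urllib tutorial')  # rotates engines, retries up to 3 times
        if data:
            for result in data['results']:
                print(result['title'], result['url'])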