diff --git a/FFAI/__pycache__/crawlers_core.cpython-313.pyc b/FFAI/__pycache__/crawlers_core.cpython-313.pyc
index d09c6e8..bbb715e 100644
Binary files a/FFAI/__pycache__/crawlers_core.cpython-313.pyc and b/FFAI/__pycache__/crawlers_core.cpython-313.pyc differ
diff --git a/FFAI/crawlers_core.py b/FFAI/crawlers_core.py
index af6d6ec..0a02008 100644
--- a/FFAI/crawlers_core.py
+++ b/FFAI/crawlers_core.py
@@ -3,6 +3,9 @@ import urllib.robotparser
 from urllib.parse import urlparse
 import time
 from bs4 import BeautifulSoup
+import random
+from urllib.parse import quote
+from fake_useragent import UserAgent
 
 class CrawlerEngine:
     def __init__(self, cache_manager):
@@ -11,7 +14,14 @@ class CrawlerEngine:
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
             'Accept-Language': 'zh-CN,zh;q=0.9'
         }
-        self.delay = 2  # crawl delay (seconds)
+        self.ua = UserAgent()
+        self.search_engines = [
+            "https://www.baidu.com/s?wd={}",
+            "https://www.sogou.com/web?query={}",
+            "https://cn.bing.com/search?q={}"
+        ]
+        self.delay_range = (2, 5)  # random delay range in seconds
+
 
     def _can_fetch(self, url) -> bool:
         """Check robots.txt permissions"""
@@ -24,7 +34,78 @@
             return rp.can_fetch(self.headers['User-Agent'], url)
         except:
             return True
-    
+
+    def _get_random_header(self):
+        return {
+            'User-Agent': self.ua.random,
+            'Accept': 'text/html,application/xhtml+xml',
+            'Accept-Language': 'zh-CN,zh;q=0.9',
+            'Referer': 'https://www.google.com/'
+        }
+
+    def _smart_delay(self):
+        time.sleep(random.uniform(*self.delay_range))
+
+    def _bypass_anti_spider(self, url):
+        """Smart workaround for basic anti-scraping defenses"""
+        try:
+            req = urllib.request.Request(
+                url,
+                headers=self._get_random_header(),
+                method='GET'
+            )
+            # Optional proxy support
+            # proxy = random.choice(proxies)
+            # req.set_proxy(proxy, 'http')
+
+            with urllib.request.urlopen(req, timeout=15) as response:
+                if response.status == 200:
+                    return response.read().decode('utf-8', errors='ignore')
+            return None
+        except Exception:
+            return None
+
+    def crawl(self, query, max_retries=3):
+        """Enhanced crawl method"""
+        cached = self.cache.load_from_cache(query)
+        if cached:
+            return cached
+
+        for attempt in range(max_retries):
+            try:
+                search_url = random.choice(self.search_engines).format(quote(query))
+                print(f"Trying to crawl: {search_url} (attempt {attempt+1})")
+
+                html = self._bypass_anti_spider(search_url)
+                self._smart_delay()
+
+                if html:
+                    data = self._extract_data(html, query)
+                    self.cache.save_to_cache(query, data)
+                    return data
+            except Exception as e:
+                print(f"Attempt failed: {str(e)}")
+                if attempt == max_retries - 1:
+                    if cached:
+                        return cached
+                    raise RuntimeError(f"Crawl failed and no cache is available: {str(e)}")
+
+    def _extract_data(self, html, query):
+        """Extract search results with BeautifulSoup"""
+        soup = BeautifulSoup(html, 'html.parser')
+        # Parsing logic covering the different search engines
+        results = []
+        for item in soup.select('.result, .res, .b_algo')[:10]:  # generic result selectors
+            title = item.find('h3')
+            link = item.find('a', href=True)
+            if title and link:
+                results.append({
+                    'title': title.get_text(strip=True),
+                    'url': link['href'],
+                    'snippet': item.find('p').get_text(strip=True)[:200] if item.find('p') else ''
+                })
+        return {'query': query, 'results': results}
+
     def _fetch_html(self, url) -> str:
         """Safely fetch page content"""
         if not self._can_fetch(url):
@@ -71,7 +152,7 @@
         try:
             # Simulate a search-engine query (example uses Baidu)
-            search_url = f"https://www.baidu.com/s?wd={urllib.parse.quote(query)}"
+            search_url = f"https://www.bing.com/search?q={urllib.parse.quote(query)}"
             html = self._fetch_html(search_url)
             data = self._extract_content(html)
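
Review note: below is a minimal sketch of how the new crawl() path could be exercised. The DictCache class is a hypothetical stand-in, not part of this change; the real cache_manager only needs the load_from_cache/save_to_cache methods that crawl() calls, and the import path assumes FFAI is importable as a package.

# Hypothetical smoke test for CrawlerEngine.crawl() (not part of the diff).
from FFAI.crawlers_core import CrawlerEngine

class DictCache:
    """Assumed in-memory stand-in for the real cache_manager."""
    def __init__(self):
        self._store = {}

    def load_from_cache(self, query):
        return self._store.get(query)

    def save_to_cache(self, query, data):
        self._store[query] = data

if __name__ == "__main__":
    engine = CrawlerEngine(DictCache())
    data = engine.crawl("python web scraping")
    # crawl() may return None if every attempt fails without raising
    if data:
        for item in data["results"]:
            print(item["title"], item["url"])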