FC/FFAICilent/crawlers_core.py


import random
import time
import urllib.request
import urllib.robotparser
from urllib.parse import quote, urlparse

from bs4 import BeautifulSoup
from fake_useragent import UserAgent

class CrawlerEngine:
    def __init__(self, cache_manager):
        self.cache = cache_manager
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        self.ua = UserAgent()
        self.search_engines = [
            "https://www.baidu.com/s?wd={}",
            "https://www.sogou.com/web?query={}",
            "https://cn.bing.com/search?q={}"
        ]
        self.delay_range = (2, 5)  # random delay range in seconds
    def _can_fetch(self, url) -> bool:
        """Check robots.txt permission for the target URL."""
        try:
            parsed = urlparse(url)
            base_url = f"{parsed.scheme}://{parsed.netloc}"
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(f"{base_url}/robots.txt")
            rp.read()
            return rp.can_fetch(self.headers['User-Agent'], url)
        except Exception:
            return True
    def _get_random_header(self):
        """Build request headers with a randomized User-Agent."""
        return {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Referer': 'https://www.google.com/'
        }
    def _smart_delay(self):
        """Sleep for a random interval to avoid hammering the target."""
        time.sleep(random.uniform(*self.delay_range))
    def _bypass_anti_spider(self, url):
        """Fetch a URL with randomized headers to work around basic anti-bot measures."""
        try:
            req = urllib.request.Request(
                url,
                headers=self._get_random_header(),
                method='GET'
            )
            # Optional proxy support:
            # proxy = random.choice(proxies)
            # req.set_proxy(proxy, 'http')
            with urllib.request.urlopen(req, timeout=15) as response:
                if response.status == 200:
                    return response.read().decode('utf-8', errors='ignore')
                return None
        except Exception:
            return None
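    # A minimal sketch of the optional proxy rotation hinted at by the
    # commented-out lines above. The method name _open_with_proxy and the
    # proxies parameter are illustrative assumptions, not part of the
    # original design.
    def _open_with_proxy(self, url, proxies=None):
        """Hypothetical variant of _bypass_anti_spider that routes the request
        through a randomly chosen HTTP proxy when a proxy list is supplied."""
        req = urllib.request.Request(
            url,
            headers=self._get_random_header(),
            method='GET'
        )
        if proxies:
            # set_proxy() rewrites the request to go through the given host:port
            req.set_proxy(random.choice(proxies), 'http')
        try:
            with urllib.request.urlopen(req, timeout=15) as response:
                if response.status == 200:
                    return response.read().decode('utf-8', errors='ignore')
        except Exception:
            return None
        return None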
    def crawl_search(self, query, max_retries=3):
        """Enhanced crawl: fetch a search-results page via a random engine and parse it."""
        cached = self.cache.load_from_cache(query)
        if cached:
            return cached
        last_error = None
        for attempt in range(max_retries):
            try:
                search_url = random.choice(self.search_engines).format(quote(query))
                print(f"Trying to crawl: {search_url} (attempt {attempt + 1})")
                html = self._bypass_anti_spider(search_url)
                self._smart_delay()
                if html:
                    data = self._extract_data(html, query)
                    self.cache.save_to_cache(query, data)
                    return data
            except Exception as e:
                last_error = e
                print(f"Attempt failed: {e}")
        raise RuntimeError(f"Crawl failed and no cache available: {last_error}")
    def _extract_data(self, html, query):
        """Extract search results with BeautifulSoup."""
        soup = BeautifulSoup(html, 'html.parser')
        # Generic selectors covering the result containers of the supported search engines
        results = []
        for item in soup.select('.result, .res, .b_algo')[:10]:
            title = item.find('h3')
            link = item.find('a', href=True)
            if title and link:
                snippet = item.find('p')
                results.append({
                    'title': title.get_text(strip=True),
                    'url': link['href'],
                    'snippet': snippet.get_text(strip=True)[:200] if snippet else ''
                })
        return {'query': query, 'results': results}
    def _fetch_html(self, url) -> str:
        """Fetch page content safely, respecting robots.txt."""
        if not self._can_fetch(url):
            raise PermissionError(f"Not allowed to crawl: {url}")
        req = urllib.request.Request(url, headers=self.headers)
        try:
            with urllib.request.urlopen(req, timeout=10) as response:
                if response.status == 200:
                    return response.read().decode('utf-8')
                raise ConnectionError(f"HTTP {response.status}")
        except Exception as e:
            raise ConnectionError(f"Fetch failed: {url} - {e}")
    def _extract_content(self, html: str) -> dict:
        """Extract structured data from HTML."""
        soup = BeautifulSoup(html, 'html.parser')
        # Remove tags that carry no useful content
        for tag in ['script', 'style', 'nav', 'footer']:
            for element in soup(tag):
                element.decompose()
        # Extract the core content
        title = soup.title.get_text(strip=True) if soup.title else ''
        text = ' '.join(p.get_text() for p in soup.find_all('p'))
        return {
            'title': title,
            'content': text.strip(),
            'links': [a['href'] for a in soup.find_all('a', href=True)]
        }
    def crawl(self, query: str, max_results=5) -> dict:
        """Run the full crawl pipeline: search, follow links, extract content."""
        # Check the cache first
        cached = self.cache.load_from_cache(query)
        if cached:
            print(f"Using cached data: {query}")
            return cached
        print(f"Starting crawl: {query}")
        results = []
        try:
            # Query a search engine (Bing in this example)
            search_url = f"https://www.bing.com/search?q={quote(query)}"
            html = self._fetch_html(search_url)
            data = self._extract_content(html)
            # Limit the number of pages fetched and pause between requests
            for link in data['links'][:max_results]:
                if link.startswith('http'):
                    try:
                        page_html = self._fetch_html(link)
                        page_data = self._extract_content(page_html)
                        results.append({
                            'source_url': link,
                            'title': page_data['title'],
                            'content': page_data['content']
                        })
                        self._smart_delay()
                    except Exception as e:
                        print(f"Sub-page fetch failed: {link} - {e}")
            # Save results to the cache
            result_data = {'query': query, 'results': results}
            self.cache.save_to_cache(query, result_data)
            return result_data
        except Exception as e:
            print(f"Crawl failed: {e}")
            if cached:
                return cached
            raise RuntimeError(f"Crawl failed and no cache available: {e}")
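
# Usage sketch: CrawlerEngine only assumes a cache manager exposing
# load_from_cache(query) and save_to_cache(query, data). The in-memory
# DictCache below is a stand-in for illustration, not the project's real
# cache_manager implementation.
if __name__ == "__main__":
    class DictCache:
        def __init__(self):
            self._store = {}

        def load_from_cache(self, query):
            return self._store.get(query)

        def save_to_cache(self, query, data):
            self._store[query] = data

    engine = CrawlerEngine(DictCache())
    try:
        result = engine.crawl("python web crawler", max_results=3)
        for item in result['results']:
            print(item['title'], item.get('source_url', ''))
    except RuntimeError as exc:
        print(f"Crawl failed: {exc}")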