modified:   FFAI/__pycache__/crawlers.cpython-313.pyc
new file:   FFAI/__pycache__/crawlers_core.cpython-313.pyc
new file:   缓存文件
modified:   旧文件
new file:   爬虫文件
modified:   主文件
new file:   cache/cache__E4_BA_BA_E5_B7_A5_E6_99_BA_E8_83_BD_json.txt
new file:   测试文件
modified:   readme.md
import time
import urllib.request
import urllib.robotparser
from urllib.parse import quote, urlparse

from bs4 import BeautifulSoup


class CrawlerEngine:
    def __init__(self, cache_manager):
        self.cache = cache_manager
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
            'Accept-Language': 'zh-CN,zh;q=0.9'
        }
        self.delay = 2  # crawl delay between requests (seconds)

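    # Note: cache_manager is assumed (from the calls in crawl() below) to
    # expose load_from_cache(query) -> dict | None and save_to_cache(query,
    # data); see the usage sketch at the end of the file for a stand-in.
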
    def _can_fetch(self, url) -> bool:
        """Check robots.txt permissions for the target URL."""
        try:
            parsed = urlparse(url)
            base_url = f"{parsed.scheme}://{parsed.netloc}"
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(f"{base_url}/robots.txt")
            rp.read()
            return rp.can_fetch(self.headers['User-Agent'], url)
        except Exception:
            # Fail open: if robots.txt is unreachable or unparsable, allow the fetch
            return True

    def _fetch_html(self, url) -> str:
        """Safely fetch page content, honoring robots.txt."""
        if not self._can_fetch(url):
            raise PermissionError(f"Not allowed to crawl: {url}")

        req = urllib.request.Request(url, headers=self.headers)
        try:
            with urllib.request.urlopen(req, timeout=10) as response:
                if response.status == 200:
                    return response.read().decode('utf-8')
                raise ConnectionError(f"HTTP {response.status}")
        except Exception as e:
            raise ConnectionError(f"Fetch failed: {url} - {str(e)}")

    def _extract_content(self, html: str) -> dict:
        """Extract structured data from HTML."""
        soup = BeautifulSoup(html, 'html.parser')

        # Strip tags that carry no article content
        for tag in ['script', 'style', 'nav', 'footer']:
            for element in soup(tag):
                element.decompose()

        # Extract the core content (guard against a missing or empty <title>)
        title = soup.title.string if soup.title and soup.title.string else ''
        text = ' '.join(p.get_text() for p in soup.find_all('p'))

        return {
            'title': title.strip(),
            'content': text.strip(),
            'links': [a['href'] for a in soup.find_all('a', href=True)]
        }

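    # Illustrative example (hypothetical input): for
    #   '<html><head><title>Demo</title></head>'
    #   '<body><p>Hello</p><a href="http://example.com">x</a></body></html>'
    # _extract_content returns:
    #   {'title': 'Demo', 'content': 'Hello', 'links': ['http://example.com']}
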
    def crawl(self, query: str, max_results=5) -> dict:
        """Run the full crawl pipeline."""
        # Check the cache first
        cached = self.cache.load_from_cache(query)
        if cached:
            print(f"Using cached data: {query}")
            return cached

        print(f"Starting crawl: {query}")
        results = []

        try:
            # Simulated search-engine query (Baidu is used as the example)
            search_url = f"https://www.baidu.com/s?wd={quote(query)}"
            html = self._fetch_html(search_url)
            data = self._extract_content(html)

            # Cap the number of pages fetched and pause between requests
            for link in data['links'][:max_results]:
                if link.startswith('http'):
                    try:
                        page_html = self._fetch_html(link)
                        page_data = self._extract_content(page_html)
                        results.append({
                            'source_url': link,
                            'title': page_data['title'],
                            'content': page_data['content']
                        })
                        time.sleep(self.delay)
                    except Exception as e:
                        print(f"Sub-page fetch failed: {link} - {str(e)}")

            # Save the results to the cache
            result_data = {'query': query, 'results': results}
            self.cache.save_to_cache(query, result_data)
            return result_data

        except Exception as e:
            print(f"Crawl failed: {str(e)}")
            if cached:
                return cached
            raise RuntimeError(f"Crawl failed and no cache available: {str(e)}")
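

# --- Usage sketch (illustrative, not part of the original module) ---
# DictCache is a hypothetical stand-in for the cache manager; the only
# assumed interface is load_from_cache(query) and save_to_cache(query, data),
# the two methods CrawlerEngine actually calls above.
if __name__ == '__main__':
    class DictCache:
        """Minimal in-memory cache implementing the assumed interface."""
        def __init__(self):
            self._store = {}

        def load_from_cache(self, query):
            return self._store.get(query)

        def save_to_cache(self, query, data):
            self._store[query] = data

    engine = CrawlerEngine(DictCache())
    data = engine.crawl("人工智能", max_results=2)
    print(f"{data['query']}: {len(data['results'])} pages fetched")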