From a7abe9d50625d6be1a38bd32b189bfa81623bf3d Mon Sep 17 00:00:00 2001 From: Friendfeng <3880261409@qq.com> Date: Sat, 7 Jun 2025 10:12:47 +0800 Subject: [PATCH] modified: FFAI/__pycache__/crawlers_core.cpython-313.pyc modified: FFAI/crawlers_core.py --- .../__pycache__/crawlers_core.cpython-313.pyc | Bin 6042 -> 9989 bytes FFAI/crawlers_core.py | 87 +++++++++++++++++- 2 files changed, 84 insertions(+), 3 deletions(-) diff --git a/FFAI/__pycache__/crawlers_core.cpython-313.pyc b/FFAI/__pycache__/crawlers_core.cpython-313.pyc index d09c6e803f071601d28086fcc11f7d71e0a2dfa0..bbb715ebc4ced9f919b99105a0940d2272f172e9 100644 GIT binary patch delta 5074 zcmahtTW}lKb$5YXyoeVGkbp>nfJBKBObV33hebvYiy|#clqiQwIL^=(yabjYY!HC% zE=-9|3Z;*fY2#Y-S`w8`Ew-_z)WlBJk=wCTWvtl#P}QTF+ z{Pdh%f?^e?xfq^%_S|#Nz4zSny8L15pA0zfIUF_u?Y%emjg8h#JN?Yq^yZEoJmHys z3$;Mc_8SjcsWn6Z=lX32D`-WC_{j#sn`#Mf?l9nVLa05-ge}u@w#nSp>uy zl|;HXDQ4uj&Za|d<>TJT>7O(exLQKZC=J##1GMs$yrq`B$WR+^h3^X9_?*4nz}tYv zQU_o0oE<36c9yqm6c=B`J635oK+(!j5 z0>r<2Bg#%@Q_paDd!z%$p+!j=$Fzl{W3rr+x?5W(CMF^yLMoAu#IxhA(w>RLu2;S@xnb?Fluc&! z;R$hM&#}BnPasdQ%oEQ zTCIGO4KqIFy8vsIAG2qC3=EVT?pnK%9YlHFSyxzdU1b)Y;SIczXCv%JkT)zF*AZV__1bSf^$scfe8I8vIAkEbafVbWLD8>&8;%_h^Lrn)do?J%yI^xL8i{4fuS zqaqb)xL}}F+9;?cksZIl&}#UUaK|C)!;z|3Qj}3^FdWmBtcL!N-9>(zqX~tC0GuQ@ zY8%dPKDYUN-VdOL!2B^2euh7*7GQ4W7 zgC=E{+gGrZ`5*S3q+?};5J{2?%s`k^SvfT>s+^P-#T<}k^QdyYQHDc^8;IB?iOvr=KJ^`hN$88e2p8dENEO-hr=71UxO zEU>2WT=%nt2%8}a7?o{CuM)6X3WKgS94jbq;?2CJ%ODweEB?xnF+?`PE+3^Ot?Nk( z*|IV>0cROCgh-Ht;Fq-SG6u;gvz`pQ%j=kl0H;Qdq;<4FbcV1k?+1pP=YIP6AN}ms zKb`*k!#`bm?bTaT@7#L#%&pUJtH%9(Ltz6&ms3sSqCA#O+{T?>U{o_+Y&I|7z6#$D zxV${Nb%$z7B{Nxyf!icfnx)b}*h+&q+e}4>J(7$*aV)Pao7uuPjh?6JY);ImCP^0L zyhIz3YD$RlY(iA6eaA8Ofggf@q07)sIBW!As)dSDE}M}=2~9HuP4^^EY~(~vkffN9 zky9}#mx6euu7AXkfaS;|JljEv^d^9lWSMnY{nxAfXCu>*#i}jyRa;7KPsv}u=x?3( zw=Vd%mwffhX5tAi6V~IL+IPQ-RQZ)pEp3G>TZ(nti*@~9a-^zjzH-O1iTLZkG{eVY z<&Ixh?)VJ^I`_JJgt7Yn&Es1(LjOS835)*W-in?O`MD#~<1+sI>sG)&;uyd`GFcFJ z*?I!zkJcFx4`?uC>)C4jsD%L-H-Z4IIRT<$BH`Yo0YYEjCb0deaU*${l%j^Rn|jDo z7YNC)4Me%^Y*3E4x$3Vl;0{Kyn`#dXubOO>DGMm~8EzPDL~A?~<(z~!L|NXr6)1qS zZCsf%M50R8RPBzgQZgoRt!i+s`l_g@Y|jE12atW>dqHKySyNcIB7IHwtnZ{m2nGbGWI-SRRUb^^Kpa0!yo!lMDW5x?@x%JLFw;90l=o2^J`Rni+)}^=p z{`2?V?hXz7xAJz?rh=PdY*1b4tdNL}Qb;>`is!V{t;cFpd@P%ai`0XhCXCvGOmQWZ z%h6Ouru~4^$C2QOLHeXZ974As3#vmBUKV3=Ru@X2LKdSWQ`J702frK_pX;;-s3EF_{SXuqL*IOug)=eG$%vV1%{A*u$YX7pq?HFRN2O7^m|HJ3!+83L5 z&o}R02<%x5JUJhD@{|14=*Qn)2n?OFU3XW{IAUtm5s%k{ny-2 z-SE{d`da6Gtru+zzOSEROK$I?d;Pq7{WW*vO^>fs7hJ5{I$yVSF0oKoXgk$g^7_vn zoIbedZJGDB%tfzxyMX2O{yS#k_1~=|{>T33zMpsl_Xvku!*$=1=C=Rn<=L@fP3txH zwvwlIZtuD5f@kZ0_yVQirt^;LwN1tFfq!khD*Q{+Rey2w;6g25^zvVT#K>=!Ih6B2 z+6cOMud!#tz6Qg`4YuBH?&B?+dpo#KTnyl!_nH`{bV%0F#rZU11q+xc9ildNkIqUSP4!+vl^NLC#7i(O{X$1NoGJbcU(!#G>T(&k0j!? zLF0K~kUju#l6>Z>pBb9nwcy%5#onm$PMJ&Ig8yv)bbm3lW5L_?e&hRJTk!6l(%k95 z^nsa#a%8HnMYy z#lYj>d9$W-J4=V2{^fyR94J2XtzQoOVqiu-f8yMUInQ7I!D6s;KG^yGP%+q94E9`k zto_Qq`QTH<7e?-w*-+INgst@4v6AXNko{|ebC1uv+KQGot=_!!1lMcuV@A}ji|#@U zuY^7UKzYmDSO^26x}Fy4cuJC>zR{wgRw~hb$k>ZOlhB9QlL#6S><6H__GL2}xT@8O z7+>i(0pQCBalCMP0Kq{7F949xAG8~R04Z3Snej`nT0#s*@hUG}vMvIe08}|!(ZJG| z0Qtv?qaoS`q%cDz>n7sDvf87HT`}dztSk6ZXgIkWTSAF+`JyqA`YznoTGl z+8bk#k%mFrY$2` zfJHkH060^xql0-xnRI-@X_k7a&w0jJz*mOudbchsOYhF;-UbO%%b}M~F8%GFmviW3 z!`jE)Z@-G^_d(VGrXD$`?S)PO0i4VRZb0K=3=-2rdLG|(ONvnCE1Up~W&mi5>dTe0 zF+GAZDz#)xpGL+Z1Q;57yBE)}S{Azx&37NV8eQn-i(Ny-bulX6gQPnsi zwQGS{tHt!P%Q-5%1a<}~0&WF70w1&buHHs$b*HLj=Xf@ePm8-LRKx^|AR;v@IiIio Z--+W>((ozS_$51TU=A^NiPmXT{})5d;jsV! delta 1512 zcmYKeT})e5_@3T-`~L@nwWU84Dr_Btfl@HbG(jvI2@=M2nHsWsY0uiu+n=3#3k+va z6AhUf!83U3ERWc8DC6%(a2+sm@S7a@nNWaYe1uk#(2&x;7RUxzVrS6-gm}7 zIpy4QIIJ8!HztNNS6$0aFF3b+Eavrc?VKdEaFQ4^FgC{_a~edA%QqvQWSvll;{Idr z((>cZ1%ArZZ;-?m?uvlSl5xr;@l%$5K{DwSt7MnVhcpYLT>;1@S&=<$i#mv77|F8= zMDct&n}<~sHJ#k|u6yArWZ&MC6nxCRG6{=X_n4edm*h08GRY`U#H5_eKw`>3IR%kQ#1vHIYaj7p@H}qgM~r4F zrncf7@3Yj*+)>Y>@PW0w#?Jy^!9j7#Y-eUf4 zh)K)lV4l!tu}fMedq@SdWKRETh^KLiLK@?UC;f^nLdISsXh z2IsV>ff#yv2`H+pxVph_{6ipyLEuE_Ip!V6FH!{|gWzQ}LJ6z^T2@ma9tlLiS$rkn zA0Z-*Ia`dH>roFfJwYv$iiFqXG%-O1=3rh!XLSbii{cbjS*KVU?9fy;12J#{{~P$$ zL_}Icbgle-aD}({s8sDU{B{`2ZOy(T`s@9r@Xpt7?tJmyuZwqfK3q8+p7-Nd4E_Z) zO4;MA5L!`pht5#~joC`WK}nb>A?1I1$wQetcbBTSzN~(F`&9$ PpF#qFF|f<&mu}-fDIcl9 diff --git a/FFAI/crawlers_core.py b/FFAI/crawlers_core.py index af6d6ec..0a02008 100644 --- a/FFAI/crawlers_core.py +++ b/FFAI/crawlers_core.py @@ -3,6 +3,9 @@ import urllib.robotparser from urllib.parse import urlparse import time from bs4 import BeautifulSoup +import random +from urllib.parse import quote +from fake_useragent import UserAgent class CrawlerEngine: def __init__(self, cache_manager): @@ -11,7 +14,14 @@ class CrawlerEngine: 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)', 'Accept-Language': 'zh-CN,zh;q=0.9' } - self.delay = 2 # 爬取延迟(秒) + self.ua = UserAgent() + self.search_engines = [ + "https://www.baidu.com/s?wd={}", + "https://www.sogou.com/web?query={}", + "https://cn.bing.com/search?q={}" + ] + self.delay_range = (2, 5) # 随机延迟秒数 + def _can_fetch(self, url) -> bool: """检查robots.txt权限""" @@ -24,7 +34,78 @@ class CrawlerEngine: return rp.can_fetch(self.headers['User-Agent'], url) except: return True - + + def _get_random_header(self): + return { + 'User-Agent': self.ua.random, + 'Accept': 'text/html,application/xhtml+xml', + 'Accept-Language': 'zh-CN,zh;q=0.9', + 'Referer': 'https://www.google.com/' + } + + def _smart_delay(self): + time.sleep(random.uniform(*self.delay_range)) + + def _bypass_anti_spider(self, url): + """智能绕过反爬策略""" + try: + req = urllib.request.Request( + url, + headers=self._get_random_header(), + method='GET' + ) + # 添加代理支持(可选) + # proxy = random.choice(proxies) + # req.set_proxy(proxy, 'http') + + with urllib.request.urlopen(req, timeout=15) as response: + if response.status == 200: + return response.read().decode('utf-8', errors='ignore') + return None + except Exception: + return None + + def crawl(self, query, max_retries=3): + """增强版爬取方法""" + cached = self.cache.load_from_cache(query) + if cached: + return cached + + for attempt in range(max_retries): + try: + search_url = random.choice(self.search_engines).format(quote(query)) + print(f"尝试爬取: {search_url} (第{attempt+1}次)") + + html = self._bypass_anti_spider(search_url) + self._smart_delay() + + if html: + data = self._extract_data(html) + self.cache.save_to_cache(query, data) + return data + except Exception as e: + print(f"尝试失败: {str(e)}") + if attempt == max_retries - 1: + if cached: + return cached + raise RuntimeError(f"爬取失败且无缓存可用: {str(e)}") + + def _extract_data(self, html): + """使用BeautifulSoup提取数据""" + soup = BeautifulSoup(html, 'html.parser') + # 添加针对不同搜索引擎的解析逻辑 + results = [] + for item in soup.select('.result, .res, .b_algo')[:10]: # 通用选择器 + title = item.find('h3') + link = item.find('a', href=True) + if title and link: + results.append({ + 'title': title.get_text(strip=True), + 'url': link['href'], + 'snippet': item.find('p').get_text(strip=True)[:200] if item.find('p') else '' + }) + return {'query': query, 'results': results} + def _fetch_html(self, url) -> str: """安全获取网页内容""" if not self._can_fetch(url): @@ -71,7 +152,7 @@ class CrawlerEngine: try: # 模拟搜索引擎查询(示例使用百度) - search_url = f"https://www.baidu.com/s?wd={urllib.parse.quote(query)}" + search_url = f"https://www.bing.com/search?q={urllib.parse.quote(query)}" html = self._fetch_html(search_url) data = self._extract_content(html)