From b04a959a2720682d79c2774131193326f97a6366 Mon Sep 17 00:00:00 2001
From: Friendfeng <3880261409@qq.com>
Date: Sat, 7 Jun 2025 01:51:02 +0800
Subject: [PATCH] modified:   FFAIall/main.py
 modified:   FFAInobug/__pycache__/crawlers.cpython-313.pyc
 modified:   FFAInobug/crawlers.py
 modified:   FFAInobug/main.py

---
 FFAIall/main.py                           |  4 +-
 .../__pycache__/crawlers.cpython-313.pyc  | Bin 4682 -> 6527 bytes
 FFAInobug/crawlers.py                     | 74 ++++++++++++++----
 FFAInobug/main.py                         | 22 +++---
 4 files changed, 72 insertions(+), 28 deletions(-)

diff --git a/FFAIall/main.py b/FFAIall/main.py
index 414b8ce..7e5622f 100644
--- a/FFAIall/main.py
+++ b/FFAIall/main.py
@@ -1,5 +1,7 @@
 from crawlers import PureHTMLParser  # type: ignore
 from analyzer import PureAnalyzer  # type: ignore
+import sys
+import os
 
 class PureInfoHunter:
     def __init__(self):
@@ -37,8 +39,6 @@ class PureInfoHunter:
         print(f"Report saved to reports/{safe_query}_report.txt")
 
 if __name__ == "__main__":
-    import sys
-    import os
     os.makedirs("reports", exist_ok=True)
 
     if len(sys.argv) < 2:

diff --git a/FFAInobug/__pycache__/crawlers.cpython-313.pyc b/FFAInobug/__pycache__/crawlers.cpython-313.pyc
index 8bfb92aa51875b57ef4fb4ddd4822aa4ba932db3..7502fc19f69a55b3503b2286a10093731446d461 100644
Binary files a/FFAInobug/__pycache__/crawlers.cpython-313.pyc and b/FFAInobug/__pycache__/crawlers.cpython-313.pyc differ
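Note: the crawlers.py hunks below add an mtime-based cache-validity check and a
crawl-then-fallback fetch flow. A minimal standalone sketch of that caching pattern,
assuming an expiry window in seconds (the names here are illustrative, not the
project's API):

    import os
    import time

    CACHE_EXPIRY = 3600  # assumed expiry window in seconds

    def is_cache_valid(cache_file: str, expiry: int = CACHE_EXPIRY) -> bool:
        """True if the cache file exists and its mtime is within the expiry window."""
        if not os.path.exists(cache_file):
            return False
        return (time.time() - os.path.getmtime(cache_file)) < expiry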
diff --git a/FFAInobug/crawlers.py b/FFAInobug/crawlers.py
index e0130b2..f3c09c2 100644
--- a/FFAInobug/crawlers.py
+++ b/FFAInobug/crawlers.py
@@ -3,8 +3,12 @@ import os
 import time
 from urllib.parse import quote
 from html.parser import HTMLParser
+import requests  # type: ignore
+from bs4 import BeautifulSoup  # type: ignore
+from urllib.parse import quote_plus
 
 class PureHTMLParser(HTMLParser):
+    pass  # ...(previous HTML parser code unchanged)...
 
 
 class PureCrawler:
@@ -14,6 +18,14 @@ class PureHTMLParser(HTMLParser):
         self.cache_dir = cache_dir
         os.makedirs(cache_dir, exist_ok=True)
 
+    def _is_cache_valid(self, cache_file):
+        """Check whether a cache file exists and has not expired"""
+        if not os.path.exists(cache_file):
+            return False
+        # self.cache_expiry is assumed to be set in __init__ (not shown in this hunk)
+        file_time = os.path.getmtime(cache_file)
+        return (time.time() - file_time) < self.cache_expiry
+
     def _get_cache_path(self, query: str) -> str:
         """Build the cache file name"""
         safe_query = "".join(c if c.isalnum() else "_" for c in query)
@@ -51,21 +63,49 @@ class PureHTMLParser(HTMLParser):
             items.append({"url": url, "text": text})
         return items
 
-    def fetch(self, query: str, force_update=False) -> list:
-        """Read from cache first; crawl when absent"""
-        if not force_update:
-            cached = self._load_from_cache(query)
-            if cached:
-                print("📂 Loading data from cache")
-                return cached
+    def fetch(self, query: str, force_update=False) -> dict:
+        """Crawl with cache-first behavior; force_update bypasses a valid cache"""
+        cache_file = os.path.join(self.cache_dir, f"{quote_plus(query)}.json")
 
-        print("🌐 Crawling web data...")
-        # ...(keep the original crawling logic)...
-        data = [{"url": link, "text": self.extract_text(link)} for link in self.parser.links[:5]]
-
-        self._save_to_cache(query, data)
-        return data
-
-    def extract_text(self, url: str) -> str:
-        # ...(keep the original text-extraction logic)..
-        return extracted_text
\ No newline at end of file
+        # Serve from cache when still valid (_load_from_cache is assumed to accept a file path now)
+        if not force_update and self._is_cache_valid(cache_file):
+            return self._load_from_cache(cache_file)
+
+        try:
+            # Actual crawling logic -- Baidu search used as the example target
+            search_url = f"https://www.baidu.com/s?wd={quote_plus(query)}"
+            response = requests.get(search_url, headers=self.headers, timeout=10)
+            response.raise_for_status()
+
+            # Parse the page content
+            soup = BeautifulSoup(response.text, 'html.parser')
+            results = []
+
+            # Extract search results -- adjust the selectors to the target site's structure
+            for item in soup.select('.result.c-container'):
+                title = item.select_one('h3').get_text(strip=True) if item.select_one('h3') else ""
+                link = item.find('a')['href'] if item.find('a') else ""
+                abstract = item.select_one('.c-abstract').get_text(strip=True) if item.select_one('.c-abstract') else ""
+                results.append({
+                    'title': title,
+                    'url': link,
+                    'abstract': abstract
+                })
+
+            data = {
+                'query': query,
+                'results': results,
+                'timestamp': int(time.time()),
+                'sources': [search_url]
+            }
+
+            # Save to cache (the same path-based convention is assumed for _save_to_cache)
+            self._save_to_cache(cache_file, data)
+            return data
+
+        except Exception as e:
+            # If crawling fails but a cache exists, fall back to it
+            if os.path.exists(cache_file):
+                print(f"Fetch failed, using cached data: {str(e)}")
+                return self._load_from_cache(cache_file)
+            raise RuntimeError(f"Fetch failed and no cache is available: {str(e)}")
\ No newline at end of file
diff --git a/FFAInobug/main.py b/FFAInobug/main.py
index 5cc8931..594b618 100644
--- a/FFAInobug/main.py
+++ b/FFAInobug/main.py
@@ -41,19 +41,23 @@ if __name__ == "__main__":
     import os
     os.makedirs("reports", exist_ok=True)
 
+    # Handle missing command-line arguments
     if len(sys.argv) < 2:
         print("Usage: python pure_main.py '<search keywords>' [force_update]")
         print("Example: python pure_main.py 'artificial intelligence' true")
-        # sys.exit(1)
+        query = input("Enter the keywords to search for: ")  # fall back to interactive input
+        force_update = input("Force update (true/false)? ").lower() == "true"
+    else:
+        query = sys.argv[1]
+        force_update = len(sys.argv) > 2 and sys.argv[2].lower() == "true"
 
-    force_update = len(sys.argv) > 2 and sys.argv[2].lower() == "true"
     hunter = PureInfoHunter()
 
-    # if force_update:
-    #     print("Force-update mode (cache ignored)")
-    #     data = hunter.crawler.fetch(sys.argv[1], force_update=True)
-    #     result = hunter.analyzer.analyze(data, sys.argv[1])
-    # else:
-    #     result = hunter.run(sys.argv[1])
+    if force_update:
+        print("Force-update mode (cache ignored)")
+        data = hunter.crawler.fetch(query, force_update=True)
+        result = hunter.analyzer.analyze(data, query)
+    else:
+        result = hunter.run(query)
 
-    # print(result)
\ No newline at end of file
+    print(result)
\ No newline at end of file
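Note: a quick way to exercise the new fetch flow after applying this patch -- a sketch
that assumes PureCrawler can be constructed with its defaults and that the module layout
above is on the import path:

    # assumed to run from the FFAInobug directory
    from crawlers import PureCrawler

    crawler = PureCrawler()
    first = crawler.fetch("artificial intelligence")    # crawls and writes the cache
    second = crawler.fetch("artificial intelligence")   # served from cache while still valid
    fresh = crawler.fetch("artificial intelligence", force_update=True)  # bypasses the cache
    print(fresh["results"][:3])

The same flow is reachable from the CLI entry point, e.g. `python pure_main.py 'artificial intelligence' true`.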