From 6b7ae8f26edb841c5eb5d25af2f3ad9303793971 Mon Sep 17 00:00:00 2001
From: Friendfeng <3880261409@qq.com>
Date: Sat, 7 Jun 2025 09:01:37 +0800
Subject: [PATCH] new file: FFAI/__pycache__/catch.cpython-313.pyc;
 modified: FFAI/__pycache__/crawlers.cpython-313.pyc;
 new file: FFAI/__pycache__/crawlers_core.cpython-313.pyc;
 new file: cache file; modified: old file; new file: crawler file;
 modified: main file;
 new file: cache/cache__E4_BA_BA_E5_B7_A5_E6_99_BA_E8_83_BD_json.txt;
 new file: test file; modified: readme.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 FFAI/__pycache__/catch.cpython-313.pyc         | Bin 0 -> 3306 bytes
 FFAI/__pycache__/crawlers.cpython-313.pyc      | Bin 6527 -> 129 bytes
 .../__pycache__/crawlers_core.cpython-313.pyc  | Bin 0 -> 6042 bytes
 FFAI/catch.py                                  |  49 ++++
 FFAI/crawlers.py                               | 213 +++++++++---------
 FFAI/crawlers_core.py                          | 102 +++++++++
 FFAI/main.py                                   |  16 +-
 ..._BA_BA_E5_B7_A5_E6_99_BA_E8_83_BD_json.txt  |   0
 cache/cache_bilibili_json.txt                  |   0
 readme.md                                      |   2 +-
 10 files changed, 274 insertions(+), 108 deletions(-)
 create mode 100644 FFAI/__pycache__/catch.cpython-313.pyc
 create mode 100644 FFAI/__pycache__/crawlers_core.cpython-313.pyc
 create mode 100644 FFAI/catch.py
 create mode 100644 FFAI/crawlers_core.py
 create mode 100644 cache/cache__E4_BA_BA_E5_B7_A5_E6_99_BA_E8_83_BD_json.txt
 create mode 100644 cache/cache_bilibili_json.txt

diff --git a/FFAI/__pycache__/catch.cpython-313.pyc b/FFAI/__pycache__/catch.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bfaa0716f99bd57c4d5b24da3525bb69a2fea76c
GIT binary patch
[literal 3306: binary data omitted]
diff --git a/FFAI/__pycache__/crawlers.cpython-313.pyc b/FFAI/__pycache__/crawlers.cpython-313.pyc
index 7502fc19f69a55b3503b2286a10093731446d461..4e6ee6363654bf5a43010544968fa76eded6266f 100644
GIT binary patch
[literal 129 / literal 6527: binary data omitted]
diff --git a/FFAI/__pycache__/crawlers_core.cpython-313.pyc b/FFAI/__pycache__/crawlers_core.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d09c6e803f071601d28086fcc11f7d71e0a2dfa0
GIT binary patch
[literal 6042: binary data omitted]

diff --git a/FFAI/catch.py b/FFAI/catch.py
new file mode 100644
--- /dev/null
+++ b/FFAI/catch.py
@@ -0,0 +1,49 @@
+import hashlib
+import json
+import os
+from datetime import datetime
+
+class CacheManager:
+    def __init__(self, cache_dir="cache"):
+        self.cache_dir = cache_dir
+        os.makedirs(cache_dir, exist_ok=True)
+
+    def _get_cache_path(self, query: str) -> str:
+        """生成基于查询内容的缓存文件名"""
+        query_hash = hashlib.md5(query.encode('utf-8')).hexdigest()
+        return os.path.join(self.cache_dir, f"{query_hash}.json")
+
+    def save_to_cache(self, query: str, data: 
dict) -> bool: + """保存数据到缓存(带时间戳)""" + cache_data = { + 'timestamp': datetime.now().isoformat(), + 'query': query, + 'data': data + } + try: + with open(self._get_cache_path(query), 'w', encoding='utf-8') as f: + json.dump(cache_data, f, ensure_ascii=False, indent=2) + return True + except Exception as e: + print(f"缓存保存失败: {e}") + return False + + def load_from_cache(self, query: str, max_age_hours=24) -> dict: + """从缓存加载数据(可设置最大有效期)""" + cache_file = self._get_cache_path(query) + if not os.path.exists(cache_file): + return None + + try: + with open(cache_file, 'r', encoding='utf-8') as f: + cache_data = json.load(f) + + # 检查缓存有效期 + cache_time = datetime.fromisoformat(cache_data['timestamp']) + if (datetime.now() - cache_time).total_seconds() > max_age_hours * 3600: + return None + + return cache_data['data'] + except Exception as e: + print(f"缓存读取失败: {e}") + return None \ No newline at end of file diff --git a/FFAI/crawlers.py b/FFAI/crawlers.py index ed37cb3..fceb4a6 100644 --- a/FFAI/crawlers.py +++ b/FFAI/crawlers.py @@ -1,119 +1,128 @@ -import urllib.request -import os -import time -from urllib.parse import quote -from html.parser import HTMLParser -import requests # type: ignore -from bs4 import BeautifulSoup # type: ignore -from urllib.parse import quote_plus +# from typing import Self +# import urllib.request +# import os +# import time +# from urllib.parse import quote +# from html.parser import HTMLParser +# import requests # type: ignore +# from bs4 import BeautifulSoup # type: ignore +# from urllib.parse import quote_plus -class PureHTMLParser(HTMLParser): +# class PureHTMLParser(HTMLParser): - # ...(保持之前的HTML解析器代码不变)... - def __init__(self, cache_dir="cache"): - self.user_agent = "Mozilla/5.0" - # self.parser = PureHTMLParser() - self.cache_dir = cache_dir - os.makedirs(cache_dir, exist_ok=True) +# def __init__(self, cache_dir="cache"): +# self.user_agent = "Mozilla/5.0" +# # self.parser = PureHTMLParser() +# self.cache_dir = cache_dir +# os.makedirs(cache_dir, exist_ok=True) - def _is_cache_valid(self, cache_file): - """检查缓存是否有效""" - if not os.path.exists(cache_file): - return False +# def _is_cache_valid(self, cache_file): +# """检查缓存是否有效""" +# if not os.path.exists(cache_file): +# return False - file_time = os.path.getmtime(cache_file) - return (time.time() - file_time) < self.cache_expiry +# file_time = os.path.getmtime(cache_file) +# return (time.time() - file_time) < self.cache_expiry - def _get_cache_path(self, query: str) -> str: - """生成缓存文件名""" - safe_query = "".join(c if c.isalnum() else "_" for c in query) - return f"{self.cache_dir}/{safe_query}.txt" +# def _get_cache_path(self, query: str) -> str: +# """生成缓存文件名""" +# safe_query = "".join(c if c.isalnum() else "_" for c in query) +# return f"{self.cache_dir}/{safe_query}.txt" - def _save_to_cache(self, query: str, data: list): - """保存搜索结果到缓存""" - with open(self._get_cache_path(query), "w", encoding="utf-8") as f: - for item in data: - f.write(f"URL: {item['url','']}\n") - f.write(f"Text: {'abstract', item.get('text', '')}\n") - f.write("="*50 + "\n") +# def _save_to_cache(self, query: str, data: list): +# """保存搜索结果到缓存(修正版)""" +# cache_file = self._get_cache_path(query) +# try: +# with open(cache_file, "w", encoding="utf-8") as f: +# for item in data: +# # 修正点:确保item是字典且包含url键 +# url = item.get('url', '') # 安全访问 +# text = item.get('text', '') +# f.write(f"URL: {url}\n") +# f.write(f"Text: {text}\n") +# f.write("="*50 + "\n") +# except Exception as e: +# print(f"缓存保存失败: {e}") +# def _load_from_cache(self, query: str) 
-> list: +# """从缓存加载数据""" +# cache_file = self._get_cache_path(query) +# if not os.path.exists(cache_file): +# return None + +# with open(cache_file, "r", encoding="utf-8") as f: +# content = f.read() + +# # 解析缓存文件 +# items = [] +# for block in content.split("="*50): +# if not block.strip(): +# continue +# url = text = "" +# for line in block.split("\n"): +# if line.startswith("URL: "): +# url = line[5:] +# elif line.startswith("Text: "): +# text = line[6:] +# if url: +# items.append({"url": url, "text": text}) +# return items - def _load_from_cache(self, query: str) -> list: - """从缓存加载数据""" - cache_file = self._get_cache_path(query) - if not os.path.exists(cache_file): - return None +# def fetch(self, query, force_update=False): + - with open(cache_file, "r", encoding="utf-8") as f: - content = f.read() + +# # 确保有默认headers +# if not hasattr(self, 'headers'): +# self.headers = { +# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' +# } - # 解析缓存文件 - items = [] - for block in content.split("="*50): - if not block.strip(): - continue - url = text = "" - for line in block.split("\n"): - if line.startswith("URL: "): - url = line[5:] - elif line.startswith("Text: "): - text = line[6:] - if url: - items.append({"url": url, "text": text}) - return items - - def fetch(self, query, force_update=False): - # 确保有默认headers - if not hasattr(self, 'headers'): - self.headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - } +# cache_file = os.path.join(self.cache_dir, f"{quote_plus(query)}.json") - cache_file = os.path.join(self.cache_dir, f"{quote_plus(query)}.json") - - # 检查缓存是否有效 - if not force_update and os.path.exists(cache_file) and self._is_cache_valid(cache_file): - return self._load_from_cache(cache_file) +# # 检查缓存是否有效 +# if not force_update and os.path.exists(cache_file) and self._is_cache_valid(cache_file): +# return self._load_from_cache(cache_file) - try: - # 实际抓取逻辑 - 以百度搜索为例 - search_url = f"https://www.baidu.com/s?wd={quote_plus(query)}" - response = requests.get(search_url, headers=self.headers, timeout=10) - response.raise_for_status() +# try: +# # 实际抓取 +# search_url = f"https://www.baidu.com/s?wd={quote_plus(query)}" +# response = requests.get(search_url, headers=self.headers, timeout=10) +# response.raise_for_status() - # 解析网页内容 - soup = BeautifulSoup(response.text, 'html.parser') - results = [] +# # 解析网页内容 +# soup = BeautifulSoup(response.text, 'html.parser') +# results = [] - # 提取搜索结果 - 百度搜索结果的实际选择器可能需要调整 - for item in soup.select('.result.c-container'): - title_elem = item.select_one('h3') - link_elem = item.find('a') - abstract_elem = item.select_one('.c-abstract') +# # 提取搜索结果 - 百度搜索结果的实际选择器可能需要调整 +# for item in soup.select('.result.c-container'): +# title_elem = item.select_one('h3') +# link_elem = item.find('a') +# abstract_elem = item.select_one('.c-abstract') - if title_elem and link_elem: - results.append({ - 'title': title_elem.get_text(strip=True), - 'url': link_elem.get('href'), - 'abstract': abstract_elem.get_text(strip=True) if abstract_elem else "" - }) +# if title_elem and link_elem: +# results.append({ +# 'title': title_elem.get_text(strip=True), +# 'url': link_elem.get('href'), +# 'abstract': abstract_elem.get_text(strip=True) if abstract_elem else "" +# }) - data = { - 'query': query, - 'results': results if results else [{'title': '无结果', 'url': '', 'abstract': ''}], - 'timestamp': 
int(time.time()), - 'sources': [search_url] - } +# data = { +# 'query': query, +# 'results': results if results else [{'title': '无结果', 'url': '', 'abstract': ''}], +# 'timestamp': int(time.time()), +# 'sources': [search_url] +# } - # 保存到缓存 - self._save_to_cache(cache_file, data) - return { - 'data': data, - 'sources': ["www.baidu.com"] - } +# # 保存到缓存 +# self._save_to_cache(cache_file, data) +# return { +# 'data': data, +# 'sources': ["www.baidu.com"] +# } - except Exception as e: - # 如果抓取失败但缓存存在,使用缓存 - if os.path.exists(cache_file): - print(f"抓取失败,使用缓存数据: {str(e)}") - return self._load_from_cache(cache_file) - raise RuntimeError(f"抓取失败且无缓存可用: {str(e)}") \ No newline at end of file +# except Exception as e: +# # 如果抓取失败但缓存存在,使用缓存 +# if os.path.exists(cache_file): +# print(f"抓取失败,使用缓存数据: {str(e)}") +# return self._load_from_cache(cache_file) +# raise RuntimeError(f"抓取失败且无缓存可用: {str(e)}") \ No newline at end of file diff --git a/FFAI/crawlers_core.py b/FFAI/crawlers_core.py new file mode 100644 index 0000000..af6d6ec --- /dev/null +++ b/FFAI/crawlers_core.py @@ -0,0 +1,102 @@ +import urllib.request +import urllib.robotparser +from urllib.parse import urlparse +import time +from bs4 import BeautifulSoup + +class CrawlerEngine: + def __init__(self, cache_manager): + self.cache = cache_manager + self.headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)', + 'Accept-Language': 'zh-CN,zh;q=0.9' + } + self.delay = 2 # 爬取延迟(秒) + + def _can_fetch(self, url) -> bool: + """检查robots.txt权限""" + try: + parsed = urlparse(url) + base_url = f"{parsed.scheme}://{parsed.netloc}" + rp = urllib.robotparser.RobotFileParser() + rp.set_url(f"{base_url}/robots.txt") + rp.read() + return rp.can_fetch(self.headers['User-Agent'], url) + except: + return True + + def _fetch_html(self, url) -> str: + """安全获取网页内容""" + if not self._can_fetch(url): + raise PermissionError(f"无权限爬取: {url}") + + req = urllib.request.Request(url, headers=self.headers) + try: + with urllib.request.urlopen(req, timeout=10) as response: + if response.status == 200: + return response.read().decode('utf-8') + raise ConnectionError(f"HTTP {response.status}") + except Exception as e: + raise ConnectionError(f"获取失败: {url} - {str(e)}") + + def _extract_content(self, html: str) -> dict: + """从HTML提取结构化数据""" + soup = BeautifulSoup(html, 'html.parser') + + # 移除不需要的标签 + for tag in ['script', 'style', 'nav', 'footer']: + for element in soup(tag): + element.decompose() + + # 提取核心内容 + title = soup.title.string if soup.title else '' + text = ' '.join(p.get_text() for p in soup.find_all('p')) + + return { + 'title': title.strip(), + 'content': text.strip(), + 'links': [a['href'] for a in soup.find_all('a', href=True)] + } + + def crawl(self, query: str, max_results=5) -> dict: + """执行完整爬取流程""" + # 先检查缓存 + cached = self.cache.load_from_cache(query) + if cached: + print(f"使用缓存数据: {query}") + return cached + + print(f"开始爬取: {query}") + results = [] + + try: + # 模拟搜索引擎查询(示例使用百度) + search_url = f"https://www.baidu.com/s?wd={urllib.parse.quote(query)}" + html = self._fetch_html(search_url) + data = self._extract_content(html) + + # 限制抓取数量并添加延迟 + for link in data['links'][:max_results]: + if link.startswith('http'): + try: + page_html = self._fetch_html(link) + page_data = self._extract_content(page_html) + results.append({ + 'source_url': link, + 'title': page_data['title'], + 'content': page_data['content'] + }) + time.sleep(self.delay) + except Exception as e: + print(f"子页面抓取失败: {link} - {str(e)}") + + # 保存结果到缓存 + result_data = {'query': query, 
'results': results}
+            self.cache.save_to_cache(query, result_data)
+            return result_data
+
+        except Exception as e:
+            print(f"爬取失败: {str(e)}")
+            if cached:
+                return cached
+            raise RuntimeError(f"爬取失败且无缓存可用: {str(e)}")
\ No newline at end of file
diff --git a/FFAI/main.py b/FFAI/main.py
index 11a846d..80c0d2a 100644
--- a/FFAI/main.py
+++ b/FFAI/main.py
@@ -1,14 +1,19 @@
-from crawlers import PureHTMLParser  # type: ignore
+# from crawlers import PureHTMLParser  # type: ignore
 from analyzer import PureAnalyzer  # type: ignore
+from crawlers_core import CrawlerEngine
+from catch import CacheManager
+
 
 class PureInfoHunter:
     def __init__(self):
-        self.crawler = PureHTMLParser()
+        self.cache_manager = CacheManager()
+        self.crawler = CrawlerEngine(self.cache_manager)
         self.analyzer = PureAnalyzer()
+        self.catch = CacheManager()
 
     def run(self, query: str):
         # 1. 获取数据(优先缓存)
-        data = self.crawler.fetch(query)
+        data = self.crawler.crawl(query)
 
         # 2. 分析(自动检索历史缓存)
         result = self.analyzer.analyze(data, query)
@@ -46,16 +51,17 @@ if __name__ == "__main__":
         print("使用方法: python pure_main.py '搜索关键词' [force_update]")
         print("示例: python pure_main.py '人工智能' true")
         query = input("请输入要搜索的关键词: ")  # 改为交互式输入
+        force_update = input("是否强制更新(true/false)? ").lower() == "true"
     else:
         query = sys.argv[1]
         force_update = len(sys.argv) > 2 and sys.argv[2].lower() == "true"
 
     hunter = PureInfoHunter()
 
     if force_update:
         print("强制更新模式(忽略缓存)")
-        data = hunter.crawler.fetch(query)  # 使用实际存在的方法名
+        data = hunter.crawler.crawl(query)  # 使用实际存在的方法名
         result = hunter.analyzer.analyze(data, query)
     else:
         result = hunter.run(query)
diff --git a/cache/cache__E4_BA_BA_E5_B7_A5_E6_99_BA_E8_83_BD_json.txt b/cache/cache__E4_BA_BA_E5_B7_A5_E6_99_BA_E8_83_BD_json.txt
new file mode 100644
index 0000000..e69de29
diff --git a/cache/cache_bilibili_json.txt b/cache/cache_bilibili_json.txt
new file mode 100644
index 0000000..e69de29
diff --git a/readme.md b/readme.md
index 3105392..6a84d1c 100644
--- a/readme.md
+++ b/readme.md
@@ -8,7 +8,7 @@
 Windows:CMD到FFAIall或者FFAInobug文件夹下然后使用***python main.py 你要问的内容***
 
 
-# 通知
+# 通知:可以正常运行了
 
 ## 调试版本
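
For reference, a minimal usage sketch of the modules added in this patch (class and method names are taken from the diff above; the example query and the printed fields are illustrative assumptions, not part of the patch):

    from catch import CacheManager
    from crawlers_core import CrawlerEngine

    # CacheManager stores results as JSON files named by an MD5 hash of the query;
    # CrawlerEngine.crawl() returns a fresh cached entry when one exists,
    # otherwise it fetches the Baidu result page and the pages it links to.
    cache = CacheManager()
    crawler = CrawlerEngine(cache)

    data = crawler.crawl("人工智能", max_results=5)  # assumed example query
    print(data['query'], len(data['results']))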