FC/FFAI/crawlers.py
Friendfeng 925c5e166b new file: FFAI/__pycache__/analyzer.cpython-313.pyc
new file:   FFAI/__pycache__/crawlers.cpython-313.pyc
	new file:   FFAI/analyzer.py
	new file:   FFAI/crawlers.py
	new file:   FFAI/main.py
	renamed:    main/build/newtest/Analysis-00.toc -> test/build/newtest/Analysis-00.toc
	renamed:    main/build/newtest/COLLECT-00.toc -> test/build/newtest/COLLECT-00.toc
	renamed:    main/build/newtest/EXE-00.toc -> test/build/newtest/EXE-00.toc
	renamed:    main/build/newtest/PKG-00.toc -> test/build/newtest/PKG-00.toc
	renamed:    main/build/newtest/PYZ-00.pyz -> test/build/newtest/PYZ-00.pyz
	renamed:    main/build/newtest/PYZ-00.toc -> test/build/newtest/PYZ-00.toc
	renamed:    main/build/newtest/base_library.zip -> test/build/newtest/base_library.zip
	renamed:    main/build/newtest/localpycs/pyimod01_archive.pyc -> test/build/newtest/localpycs/pyimod01_archive.pyc
	renamed:    main/build/newtest/localpycs/pyimod02_importers.pyc -> test/build/newtest/localpycs/pyimod02_importers.pyc
	renamed:    main/build/newtest/localpycs/pyimod03_ctypes.pyc -> test/build/newtest/localpycs/pyimod03_ctypes.pyc
	renamed:    main/build/newtest/localpycs/pyimod04_pywin32.pyc -> test/build/newtest/localpycs/pyimod04_pywin32.pyc
	renamed:    main/build/newtest/localpycs/struct.pyc -> test/build/newtest/localpycs/struct.pyc
	renamed:    main/build/newtest/newtest.exe -> test/build/newtest/newtest.exe
	renamed:    main/build/newtest/newtest.pkg -> test/build/newtest/newtest.pkg
	renamed:    main/build/newtest/warn-newtest.txt -> test/build/newtest/warn-newtest.txt
	renamed:    main/build/newtest/xref-newtest.html -> test/build/newtest/xref-newtest.html
	new file:   test/dist/newtest/_internal/VCRUNTIME140.dll
	renamed:    main/dist/newtest/_internal/_bz2.pyd -> test/dist/newtest/_internal/_bz2.pyd
	renamed:    main/dist/newtest/_internal/_decimal.pyd -> test/dist/newtest/_internal/_decimal.pyd
	renamed:    main/dist/newtest/_internal/_hashlib.pyd -> test/dist/newtest/_internal/_hashlib.pyd
	renamed:    main/dist/newtest/_internal/_lzma.pyd -> test/dist/newtest/_internal/_lzma.pyd
	renamed:    main/dist/newtest/_internal/_socket.pyd -> test/dist/newtest/_internal/_socket.pyd
	renamed:    main/dist/newtest/_internal/api-ms-win-core-console-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-console-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-datetime-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-datetime-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-debug-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-debug-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-errorhandling-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-errorhandling-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-fibers-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-fibers-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-file-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-file-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-file-l1-2-0.dll -> test/dist/newtest/_internal/api-ms-win-core-file-l1-2-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-file-l2-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-file-l2-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-handle-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-handle-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-heap-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-heap-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-interlocked-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-interlocked-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-libraryloader-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-libraryloader-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-localization-l1-2-0.dll -> test/dist/newtest/_internal/api-ms-win-core-localization-l1-2-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-memory-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-memory-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-namedpipe-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-namedpipe-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-processenvironment-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-processenvironment-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-processthreads-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-processthreads-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-processthreads-l1-1-1.dll -> test/dist/newtest/_internal/api-ms-win-core-processthreads-l1-1-1.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-profile-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-profile-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-rtlsupport-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-rtlsupport-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-string-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-string-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-synch-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-synch-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-synch-l1-2-0.dll -> test/dist/newtest/_internal/api-ms-win-core-synch-l1-2-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-sysinfo-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-sysinfo-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-timezone-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-timezone-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-core-util-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-core-util-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-crt-conio-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-crt-conio-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-crt-convert-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-crt-convert-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-crt-environment-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-crt-environment-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-crt-filesystem-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-crt-filesystem-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-crt-heap-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-crt-heap-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-crt-locale-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-crt-locale-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-crt-math-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-crt-math-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-crt-process-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-crt-process-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-crt-runtime-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-crt-runtime-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-crt-stdio-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-crt-stdio-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-crt-string-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-crt-string-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-crt-time-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-crt-time-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/api-ms-win-crt-utility-l1-1-0.dll -> test/dist/newtest/_internal/api-ms-win-crt-utility-l1-1-0.dll
	renamed:    main/dist/newtest/_internal/base_library.zip -> test/dist/newtest/_internal/base_library.zip
	renamed:    main/dist/newtest/_internal/libcrypto-3.dll -> test/dist/newtest/_internal/libcrypto-3.dll
	renamed:    main/dist/newtest/_internal/python313.dll -> test/dist/newtest/_internal/python313.dll
	renamed:    main/dist/newtest/_internal/select.pyd -> test/dist/newtest/_internal/select.pyd
	renamed:    main/dist/newtest/_internal/ucrtbase.dll -> test/dist/newtest/_internal/ucrtbase.dll
	renamed:    main/dist/newtest/_internal/unicodedata.pyd -> test/dist/newtest/_internal/unicodedata.pyd
	renamed:    main/dist/newtest/newtest.exe -> test/dist/newtest/newtest.exe
	renamed:    main/newtest.py -> test/newtest.py
	renamed:    main/newtest.spec -> test/newtest.spec
2025-06-06 23:33:48 +08:00

71 lines
2.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import urllib.request
import os
import time
from urllib.parse import quote
from html.parser import HTMLParser
class PureHTMLParser(HTMLParser):
    # ... (the HTML parser code from the earlier version is kept unchanged; it is elided here) ...
    # NOTE(review): PureCrawler.fetch reads a `links` attribute from instances of
    # this class -- confirm that the elided body defines and populates it.
class PureCrawler:
    """Crawler front-end that caches search results as text files on disk.

    Each query maps to one file inside ``cache_dir``. A cache file is a
    sequence of records, every record having the layout::

        URL: <url>
        Text: <text, possibly spanning several lines>
        ==================================================

    The crawling and text-extraction logic itself is elided in this
    listing; only the caching layer is fully visible.
    """

    # Line that terminates every record in a cache file.
    _SEPARATOR = "=" * 50

    def __init__(self, cache_dir="cache"):
        """Set up the parser and make sure the cache directory exists.

        Args:
            cache_dir: Directory that holds the per-query cache files; it is
                created if it does not exist yet.
        """
        self.user_agent = "Mozilla/5.0"
        self.parser = PureHTMLParser()
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

    def _get_cache_path(self, query: str) -> str:
        """Return the cache file path for *query*.

        Every non-alphanumeric character of the query is replaced by ``_``,
        so distinct queries that differ only in such characters share one
        cache file.
        """
        safe_query = "".join(c if c.isalnum() else "_" for c in query)
        return f"{self.cache_dir}/{safe_query}.txt"

    def _save_to_cache(self, query: str, data: list):
        """Write the search results for *query* to its cache file.

        Args:
            query: Search query the results belong to.
            data: List of dicts, each with a ``"url"`` and a ``"text"`` key.
        """
        with open(self._get_cache_path(query), "w", encoding="utf-8") as f:
            for item in data:
                f.write(
                    f"URL: {item['url']}\n"
                    f"Text: {item['text']}\n"
                    f"{self._SEPARATOR}\n"
                )

    def _load_from_cache(self, query: str) -> "list | None":
        """Load the cached results for *query*.

        Returns:
            A list of ``{"url": ..., "text": ...}`` dicts, or ``None`` when no
            cache file exists for the query.
        """
        # EAFP: open directly instead of checking for existence first, so a
        # file removed between the check and the open cannot raise.
        try:
            with open(self._get_cache_path(query), "r", encoding="utf-8") as f:
                content = f.read()
        except FileNotFoundError:
            return None

        items = []
        url = ""
        # Stays None until the "Text: " line of the current record is seen;
        # afterwards every line up to the separator belongs to the text.
        text_lines = None
        for line in content.split("\n"):
            if line == self._SEPARATOR:
                # Only a line consisting of exactly the separator ends a
                # record, so '=' runs embedded in the text are harmless.
                if url:
                    items.append({"url": url, "text": "\n".join(text_lines or [])})
                url, text_lines = "", None
            elif text_lines is not None:
                # Continuation of a multi-line text (kept verbatim, even if
                # it happens to start with "URL: " or "Text: ").
                text_lines.append(line)
            elif line.startswith("Text: "):
                text_lines = [line[6:]]
            elif line.startswith("URL: "):
                url = line[5:]

        if url:
            # Final record without a terminating separator (e.g. a truncated
            # file): keep what was read, minus the trailing newline artifact.
            if text_lines and text_lines[-1] == "":
                text_lines.pop()
            items.append({"url": url, "text": "\n".join(text_lines or [])})
        return items

    def fetch(self, query: str, force_update=False) -> list:
        """Return results for *query*, preferring the cache over crawling.

        Args:
            query: Search query.
            force_update: When true, skip the cache lookup and crawl again.

        Returns:
            A list of ``{"url": ..., "text": ...}`` dicts. A missing or empty
            cache entry triggers a fresh crawl, whose result is then cached.
        """
        if not force_update:
            cached = self._load_from_cache(query)
            if cached:
                print("📂 从缓存加载数据")
                return cached
        print("🌐 正在爬取网络数据...")
        # ... (the original crawling logic is kept unchanged; elided here) ...
        # NOTE(review): nothing visible here feeds the parser for this query;
        # presumably the elided logic populates self.parser.links -- confirm.
        data = [
            {"url": link, "text": self.extract_text(link)}
            for link in self.parser.links[:5]
        ]
        self._save_to_cache(query, data)
        return data

    def extract_text(self, url: str) -> str:
        """Return the extracted body text for *url* (extraction logic elided)."""
        # ... (the original body-text extraction logic is kept unchanged; elided here) ...
        # NOTE(review): `extracted_text` is not defined in the visible code;
        # presumably the elided logic assigns it -- confirm.
        return extracted_text