modified: FFAInobug/crawlers.py
modified: FFAInobug/main.py
This commit is contained in: parent b04a959a27, commit b1652bc1c0
FFAInobug/crawlers.py

@@ -10,11 +10,9 @@ from urllib.parse import quote_plus
 class PureHTMLParser(HTMLParser):

     # ... (keep the previous HTML parser code unchanged) ...

-class PureCrawler:
     def __init__(self, cache_dir="cache"):
         self.user_agent = "Mozilla/5.0"
-        self.parser = PureHTMLParser()
+        # self.parser = PureHTMLParser()
         self.cache_dir = cache_dir
         os.makedirs(cache_dir, exist_ok=True)

@@ -35,8 +33,8 @@ class PureHTMLParser(HTMLParser):
         """Save search results to the cache"""
         with open(self._get_cache_path(query), "w", encoding="utf-8") as f:
             for item in data:
-                f.write(f"URL: {item['url']}\n")
-                f.write(f"Text: {item['text']}\n")
+                f.write(f"URL: {item.get('url', '')}\n")
+                f.write(f"Text: {item.get('abstract', item.get('text', ''))}\n")
                 f.write("="*50 + "\n")

     def _load_from_cache(self, query: str) -> list:
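For reference, the loop above emits one plain-text record per result. A minimal sketch with a hypothetical item (the keys mirror the result dicts built in fetch() below; the values are made up):

    # Illustration only: a hypothetical result item and the record written for it
    item = {'url': 'https://example.com', 'abstract': 'sample summary'}
    record = (
        f"URL: {item.get('url', '')}\n"
        f"Text: {item.get('abstract', item.get('text', ''))}\n"
        + "=" * 50 + "\n"   # 50 "=" characters act as the record separator
    )
    print(record)
    # URL: https://example.com
    # Text: sample summary
    # ==================================================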
@@ -64,11 +62,16 @@ class PureHTMLParser(HTMLParser):
         return items

     def fetch(self, query, force_update=False):
+        # Make sure default headers exist
+        if not hasattr(self, 'headers'):
+            self.headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            }

         cache_file = os.path.join(self.cache_dir, f"{quote_plus(query)}.json")

         # Check whether the cache is valid
-        if not force_update and self._is_cache_valid(cache_file):
+        if not force_update and os.path.exists(cache_file) and self._is_cache_valid(cache_file):
             return self._load_from_cache(cache_file)

         try:
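The updated guard also calls _is_cache_valid, which is not part of this diff. A minimal sketch of what such a check might look like, assuming a simple age-based TTL (the max_age value is an assumption, not from the commit) and that os and time are already imported at module level, as they are used elsewhere in crawlers.py:

    def _is_cache_valid(self, cache_file, max_age=3600):
        # Hypothetical TTL check: the cache counts as valid while the file
        # is younger than max_age seconds. Illustration only, not the
        # implementation shipped in this commit.
        if not os.path.exists(cache_file):
            return False
        return (time.time() - os.path.getmtime(cache_file)) < max_age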
@@ -79,29 +82,34 @@ class PureHTMLParser(HTMLParser):

             # Parse the page content
             soup = BeautifulSoup(response.text, 'html.parser')
-            results = [(query)]
+            results = []

-            # Extract search results - adjust for the actual site structure
+            # Extract search results - the actual Baidu result selectors may need adjustment
             for item in soup.select('.result.c-container'):
-                title = item.select_one('h3').get_text(strip=True)
-                link = item.find('a')['href']
-                abstract = item.select_one('.c-abstract').get_text(strip=True) if item.select_one('.c-abstract') else ""
-                results.append({
-                    'title': title,
-                    'url': link,
-                    'abstract': abstract
-                })
+                title_elem = item.select_one('h3')
+                link_elem = item.find('a')
+                abstract_elem = item.select_one('.c-abstract')
+
+                if title_elem and link_elem:
+                    results.append({
+                        'title': title_elem.get_text(strip=True),
+                        'url': link_elem.get('href'),
+                        'abstract': abstract_elem.get_text(strip=True) if abstract_elem else ""
+                    })

             data = {
                 'query': query,
-                'results': results,
+                'results': results if results else [{'title': '无结果', 'url': '', 'abstract': ''}],
                 'timestamp': int(time.time()),
                 'sources': [search_url]
             }

             # Save to cache
             self._save_to_cache(cache_file, data)
-            return data
+            return {
+                'data': data,
+                'sources': ["www.baidu.com"]
+            }

         except Exception as e:
             # If fetching fails but a cache exists, fall back to the cache
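With this change, fetch() wraps the parsed data instead of returning it directly. A rough usage sketch for the fresh-fetch path, assuming the class and field names shown in this diff (the query string is made up):

    crawler = PureCrawler()
    response = crawler.fetch("example query")   # hypothetical query
    for r in response['data']['results']:
        print(r['title'], r['url'], r['abstract'])
    print(response['sources'])                  # ["www.baidu.com"]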
|
FFAInobug/main.py

@@ -55,7 +55,7 @@ if __name__ == "__main__":

     if force_update:
         print("强制更新模式(忽略缓存)")
-        data = hunter.crawler.fetch(query, force_update=True)
+        data = hunter.crawler.fetch(query)  # use the method name that actually exists
         result = hunter.analyzer.analyze(data, query)
     else:
         result = hunter.run(query)
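In fetch() as changed in crawlers.py above, the cache branch runs only while force_update is falsy. A small sketch of the two call forms, assuming the fetch signature shown in that file:

    # Default call: reuses a valid cache entry when one exists
    data = hunter.crawler.fetch(query)

    # Explicit refresh: the signature in crawlers.py still accepts force_update,
    # which skips the cache-validity branch when set to True
    data = hunter.crawler.fetch(query, force_update=True)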
|