更新爬虫逻辑,运行前先删除旧数据。结束后,调用入库的脚本。

This commit is contained in:
2025-11-12 21:00:49 +08:00
parent 5f05a62419
commit d5344aaa4a
5 changed files with 6995 additions and 2460 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@@ -11,6 +11,8 @@ import json
import time
import os
import re
import subprocess
import sys
from datetime import datetime
from loguru import logger
@@ -71,6 +73,32 @@ class TopHubScraper:
logger.error(f"获取网页内容失败: {e}")
raise
def delete_date_txt_files(self):
    """
    Delete timestamp-named ``.txt`` files from the current working directory.

    A file is removed only when its whole name has the form
    ``YYYY年MM月DD日HHMMSS.txt`` (month and day may be one or two digits).
    The cleanup is best-effort: a file that cannot be removed is logged and
    skipped, and a failure to list the directory is logged, not re-raised.

    Returns:
        None. The outcome is reported through the logger only.
    """
    logger.info("开始删除日期格式的txt文件")
    # The earlier pattern had no "年" after the four-digit year, so names in
    # the documented format never matched and nothing was ever deleted.
    date_pattern = re.compile(r'\d{4}年\d{1,2}月\d{1,2}日\d{6}\.txt')
    deleted_count = 0
    try:
        # '.' is resolved against the process working directory.
        for filename in os.listdir('.'):
            # fullmatch anchors both ends, so the ".txt" suffix is covered by
            # the pattern itself and no separate endswith() check is needed.
            if not date_pattern.fullmatch(filename):
                continue
            try:
                os.remove(filename)
            except OSError as e:
                # Keep going so one undeletable file does not block the rest.
                logger.error(f"删除文件 {filename} 失败: {e}")
            else:
                logger.info(f"已删除文件: {filename}")
                deleted_count += 1
        logger.info(f"删除完成,共删除 {deleted_count} 个日期格式的txt文件")
    except OSError as e:
        logger.error(f"删除文件时出错: {e}")
def scrape_by_node_ids(self):
"""
根据节点ID范围抓取数据
@@ -79,6 +107,9 @@ class TopHubScraper:
list: 包含已抓取数据的列表
"""
try:
# 运行逻辑前先删除所有日期格式的txt文件
self.delete_date_txt_files()
# 1. 获取网页内容
html_content = self.fetch_webpage()
tree = html.fromstring(html_content)
@@ -204,6 +235,49 @@ class TopHubScraper:
logger.error(f"保存文件失败: {e}")
raise
def call_add_data_script(self):
    """
    Run ``tophub_add_data_to_db.py`` in a child Python process and log the result.

    The script path is relative, so it is looked up in the current working
    directory. The child is started with the interpreter that runs this
    process (``sys.executable``); its stdout and stderr are captured and
    forwarded to the logger.

    Returns:
        None. A missing script, a non-zero exit code, or a failure to start
        the child is logged and not raised.
    """
    script_name = "tophub_add_data_to_db.py"
    logger.info("准备调用tophub_add_data_to_db.py脚本")
    try:
        # Bail out early with a clear message instead of a child-process error.
        if not os.path.exists(script_name):
            logger.error("tophub_add_data_to_db.py文件不存在无法调用")
            return

        logger.info("正在调用tophub_add_data_to_db.py...")
        result = subprocess.run(
            [sys.executable, script_name],
            capture_output=True,
            text=True,
            encoding='utf-8',
            # The child may write bytes that are not valid UTF-8 (for example
            # when its pipes use the locale code page). Strict decoding would
            # then fail after the child has already finished and a successful
            # run would be logged as an error, so undecodable bytes are
            # replaced instead.
            errors='replace',
        )

        if result.returncode == 0:
            logger.info("tophub_add_data_to_db.py调用成功")
        else:
            logger.error(f"tophub_add_data_to_db.py调用失败返回码: {result.returncode}")
            if result.stderr:
                logger.error(f"错误信息: {result.stderr}")
        # Captured stdout is forwarded on both the success and failure paths.
        if result.stdout:
            logger.info(f"脚本输出: {result.stdout}")
    except Exception as e:
        # Best-effort step: a failure to launch the child is logged, not raised.
        logger.error(f"调用tophub_add_data_to_db.py时出错: {e}")
if __name__ == "__main__":
    # Script entry point: scrape once, then hand the results to the DB loader.
    scraper = TopHubScraper()
    try:
        # Scrape exactly once. An extra unguarded call before this block would
        # discard its result, and every call re-runs the txt-file cleanup and
        # re-fetches the page.
        scraped_data = scraper.scrape_by_node_ids()

        # Only run the loader script when the scrape produced something.
        if scraped_data:
            scraper.call_add_data_script()
        else:
            logger.warning("未抓取到数据跳过调用tophub_add_data_to_db.py脚本")
    except Exception as e:
        # Log for the record, then re-raise so the process exits with an error.
        logger.error(f"程序执行出错: {e}")
        raise