更新爬虫逻辑,运行前先删除旧数据。结束后,调用入库的脚本。
This commit is contained in:
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
BIN
tophub_data.db
BIN
tophub_data.db
Binary file not shown.
1094
tophub_scraper.log
1094
tophub_scraper.log
File diff suppressed because it is too large
Load Diff
@@ -11,6 +11,8 @@ import json
|
||||
import time
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
|
||||
@@ -71,6 +73,32 @@ class TopHubScraper:
|
||||
logger.error(f"获取网页内容失败: {e}")
|
||||
raise
|
||||
|
||||
def delete_date_txt_files(self):
    """
    Remove date-named txt files from the current working directory.

    A file is removed when its name matches ``YYYY年MM月DD日HHMMSS.txt``
    (month and day may be written with one or two digits). A failure to
    remove a single file is logged and the scan continues; a failure of
    the directory scan itself is logged and not re-raised.
    """
    logger.info("开始删除日期格式的txt文件")
    removed = 0

    # Four-digit year, 1-2 digit month and day, six-digit time, .txt suffix.
    date_pattern = r'^\d{4}年\d{1,2}月\d{1,2}日\d{6}\.txt$'

    try:
        for entry in os.listdir('.'):
            # Cheap suffix test first, then the full name check.
            if not entry.endswith('.txt'):
                continue
            if not re.match(date_pattern, entry):
                continue

            try:
                os.remove(entry)
                logger.info(f"已删除文件: {entry}")
                removed += 1
            except Exception as e:
                # Keep going: one undeletable file must not block the rest.
                logger.error(f"删除文件 {entry} 失败: {e}")

        logger.info(f"删除完成,共删除 {removed} 个日期格式的txt文件")
    except Exception as e:
        logger.error(f"删除文件时出错: {e}")
|
||||
|
||||
def scrape_by_node_ids(self):
|
||||
"""
|
||||
根据节点ID范围抓取数据
|
||||
@@ -79,6 +107,9 @@ class TopHubScraper:
|
||||
list: 包含已抓取数据的列表
|
||||
"""
|
||||
try:
|
||||
# 运行逻辑前,先删除所有日期格式的txt文件
|
||||
self.delete_date_txt_files()
|
||||
|
||||
# 1. 获取网页内容
|
||||
html_content = self.fetch_webpage()
|
||||
tree = html.fromstring(html_content)
|
||||
@@ -204,6 +235,49 @@ class TopHubScraper:
|
||||
logger.error(f"保存文件失败: {e}")
|
||||
raise
|
||||
|
||||
def call_add_data_script(self):
    """
    Run the local tophub_add_data_to_db.py script in a child process.

    The script path is relative to the current working directory and it is
    executed with the interpreter running this process (``sys.executable``).
    The child's return code and captured stdout/stderr are written to the
    log. Nothing is returned and nothing is raised: a missing script, a
    non-zero return code, and a failure to launch are all only logged.
    """
    logger.info("准备调用tophub_add_data_to_db.py脚本")

    try:
        # Nothing to run if the script is not next to the working directory.
        if not os.path.exists("tophub_add_data_to_db.py"):
            logger.error("tophub_add_data_to_db.py文件不存在,无法调用")
            return

        logger.info("正在调用tophub_add_data_to_db.py...")
        # errors='replace': bytes in the child's output that are not valid
        # UTF-8 become U+FFFD instead of raising while decoding, which would
        # otherwise send a successful run to the except branch below.
        result = subprocess.run(
            [sys.executable, "tophub_add_data_to_db.py"],
            capture_output=True,
            text=True,
            encoding='utf-8',
            errors='replace',
        )

        if result.returncode == 0:
            logger.info("tophub_add_data_to_db.py调用成功")
            if result.stdout:
                logger.info(f"脚本输出: {result.stdout}")
        else:
            logger.error(f"tophub_add_data_to_db.py调用失败,返回码: {result.returncode}")
            if result.stderr:
                logger.error(f"错误信息: {result.stderr}")
            if result.stdout:
                logger.info(f"脚本输出: {result.stdout}")

    except Exception as e:
        logger.error(f"调用tophub_add_data_to_db.py时出错: {e}")
|
||||
|
||||
if __name__ == "__main__":
    scraper = TopHubScraper()

    try:
        # Scrape exactly once, inside the try so a failure is logged before
        # it propagates.
        scraped_data = scraper.scrape_by_node_ids()

        # Only hand off to the import script when the scrape returned data.
        if scraped_data:
            scraper.call_add_data_script()
        else:
            logger.warning("未抓取到数据,跳过调用tophub_add_data_to_db.py脚本")

    except Exception as e:
        logger.error(f"程序执行出错: {e}")
        raise
|
||||
Reference in New Issue
Block a user