更新爬虫逻辑,运行前先删除旧数据。结束后,调用入库的脚本。
This commit is contained in:
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
BIN
tophub_data.db
BIN
tophub_data.db
Binary file not shown.
1094
tophub_scraper.log
1094
tophub_scraper.log
File diff suppressed because it is too large
Load Diff
@@ -11,6 +11,8 @@ import json
|
|||||||
import time
|
import time
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
@@ -71,6 +73,32 @@ class TopHubScraper:
|
|||||||
logger.error(f"获取网页内容失败: {e}")
|
logger.error(f"获取网页内容失败: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
def delete_date_txt_files(self):
    """
    Remove every date-stamped txt file from the current working directory.

    A file qualifies when its whole name has the form
    ``YYYY年MM月DD日HHMMSS.txt`` (month and day may be one or two digits).
    Nothing is raised to the caller: a file that cannot be removed is
    logged and skipped, and an error while listing the directory is logged
    and ends the sweep.
    """
    logger.info("开始删除日期格式的txt文件")
    deleted_count = 0

    # Compiled once; the same pattern is applied to every directory entry.
    dated_name = re.compile(r'^\d{4}年\d{1,2}月\d{1,2}日\d{6}\.txt$')

    try:
        for filename in os.listdir('.'):
            # Guard clauses: ignore anything that is not a dated txt file.
            if not filename.endswith('.txt'):
                continue
            if not dated_name.match(filename):
                continue

            try:
                os.remove(filename)
            except Exception as e:
                # Best effort: report the failure and carry on with the rest.
                logger.error(f"删除文件 {filename} 失败: {e}")
            else:
                logger.info(f"已删除文件: {filename}")
                deleted_count += 1

        logger.info(f"删除完成,共删除 {deleted_count} 个日期格式的txt文件")
    except Exception as e:
        logger.error(f"删除文件时出错: {e}")
|
||||||
|
|
||||||
def scrape_by_node_ids(self):
|
def scrape_by_node_ids(self):
|
||||||
"""
|
"""
|
||||||
根据节点ID范围抓取数据
|
根据节点ID范围抓取数据
|
||||||
@@ -79,6 +107,9 @@ class TopHubScraper:
|
|||||||
list: 包含已抓取数据的列表
|
list: 包含已抓取数据的列表
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
|
# 运行逻辑前,先删除所有日期格式的txt文件
|
||||||
|
self.delete_date_txt_files()
|
||||||
|
|
||||||
# 1. 获取网页内容
|
# 1. 获取网页内容
|
||||||
html_content = self.fetch_webpage()
|
html_content = self.fetch_webpage()
|
||||||
tree = html.fromstring(html_content)
|
tree = html.fromstring(html_content)
|
||||||
@@ -204,6 +235,49 @@ class TopHubScraper:
|
|||||||
logger.error(f"保存文件失败: {e}")
|
logger.error(f"保存文件失败: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
def call_add_data_script(self):
    """
    Run the local ``tophub_add_data_to_db.py`` script in a child interpreter.

    The script is looked up relative to the current working directory and
    started with the interpreter that runs this process
    (``sys.executable``). The outcome is logged, never returned or raised:
    a missing script, a non-zero exit status, and any error while launching
    the child each produce an error log entry. The child's captured stdout
    is logged, and its stderr as well when the exit status is non-zero.

    Returns:
        None
    """
    script_name = "tophub_add_data_to_db.py"
    logger.info("准备调用tophub_add_data_to_db.py脚本")

    try:
        # Check up front so a missing script gets its own clear log message.
        if not os.path.exists(script_name):
            logger.error("tophub_add_data_to_db.py文件不存在,无法调用")
            return

        logger.info("正在调用tophub_add_data_to_db.py...")

        # The captured output is decoded as UTF-8 below, so ask the child
        # interpreter to write UTF-8 too. Without this it uses the locale
        # encoding when its streams are pipes (e.g. GBK), and decoding its
        # output would fail even though the script itself ran fine.
        child_env = {**os.environ, "PYTHONIOENCODING": "utf-8"}

        result = subprocess.run(
            [sys.executable, script_name],
            capture_output=True,
            text=True,
            encoding='utf-8',
            # Stray undecodable bytes must not turn a finished run into an
            # exception; replace them instead.
            errors='replace',
            env=child_env,
        )

        if result.returncode == 0:
            logger.info("tophub_add_data_to_db.py调用成功")
            if result.stdout:
                logger.info(f"脚本输出: {result.stdout}")
        else:
            logger.error(f"tophub_add_data_to_db.py调用失败,返回码: {result.returncode}")
            if result.stderr:
                logger.error(f"错误信息: {result.stderr}")
            if result.stdout:
                logger.info(f"脚本输出: {result.stdout}")

    except Exception as e:
        # Best effort: failing to launch the script is logged, not raised.
        logger.error(f"调用tophub_add_data_to_db.py时出错: {e}")
|
||||||
|
|
||||||
if __name__ == "__main__":
    scraper = TopHubScraper()
    try:
        # Scrape first; the follow-up script only runs when data came back.
        scraped_data = scraper.scrape_by_node_ids()

        if not scraped_data:
            logger.warning("未抓取到数据,跳过调用tophub_add_data_to_db.py脚本")
        else:
            # After scraping, hand over to tophub_add_data_to_db.py.
            scraper.call_add_data_script()
    except Exception as e:
        # Log at the top level, then re-raise so the failure stays visible.
        logger.error(f"程序执行出错: {e}")
        raise
|
||||||
Reference in New Issue
Block a user