第一次提交。

其中爬取是tophub_scraper.py
数据入库是 tophub_add_data_to_db.py
查看当前数据内容是 db_viewer.py
This commit is contained in:
2025-11-09 17:20:44 +08:00
commit 25da264413
29 changed files with 28508 additions and 0 deletions

213
tophub_add_data_to_db.py Normal file
View File

@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""
Process temporary dump files and write their contents to the database.

Reads temporary files in the expected format, extracts each title and link,
calls an API to classify the title, then writes the result to a SQLite
database.
"""
import sqlite3
import requests
import os
import re
from datetime import datetime
from tqdm import tqdm
from loguru import logger
import glob
# Configure logging: add a file sink at INFO level, rotated at 10 MB.
logger.add("tophub_add_data_to_db.log", rotation="10 MB", level="INFO")
# Classification API settings: the endpoint that is POSTed to and the model
# name sent in each request body.
# NOTE(review): the URL and model name look like a local Ollama server - confirm.
API_URL = "http://localhost:11434/api/generate"
API_MODEL = "gemma3:4b"
def init_database(db_path='tophub_data.db'):
    """Create the ``articles`` table if it does not exist yet.

    The table stores one row per article (title, url, category, source_date,
    created_at) and declares ``(title, source_date)`` UNIQUE, so the same
    title can be stored at most once per source date.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            ``'tophub_data.db'``, the location this script has always used.

    Raises:
        sqlite3.Error: If the database cannot be opened or the DDL fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS articles (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                url TEXT NOT NULL,
                category TEXT,
                source_date TEXT NOT NULL,
                created_at TEXT NOT NULL,
                UNIQUE(title, source_date)
            )
        ''')
        conn.commit()
    finally:
        # Close even when the DDL or commit fails so the handle is not leaked.
        conn.close()
    logger.info("数据库初始化完成")
def find_temp_files():
    """Return the dated ``.txt`` dump files found in the current directory.

    A file qualifies when its name matches the glob ``*年*月*日*.txt``. The
    match list is logged and returned in whatever order ``glob`` yields it.
    """
    matches = glob.glob("*年*月*日*.txt")
    logger.info(f"找到 {len(matches)} 个临时文件: {matches}")
    return matches
def parse_file_content(file_path):
    """Parse a dump file into a list of article dicts.

    The file is read as UTF-8 and consumed in fixed groups of five lines:
    node id, category, title, link, separator. A group is kept only when its
    title line contains ``标题: <text>`` and its link line contains
    ``链接: <text>``.

    The separator line is optional for the final group, so a file that ends
    right after the last link line still yields that record.

    Args:
        file_path: Path of the text file to parse.

    Returns:
        A list of dicts with keys ``'title'``, ``'url'`` and ``'category'``.
        ``'category'`` is the text after the first ``': '`` on the category
        line, or ``'未知'`` when the line has no ``': '``. An empty list is
        returned when the file cannot be read (the error is logged).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = [line.strip() for line in f]
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not stop the run.
        logger.error(f"解析文件 {file_path} 失败: {e}")
        return []

    # Compile once; both patterns are reused for every record.
    title_re = re.compile(r'标题: (.+)')
    url_re = re.compile(r'链接: (.+)')

    articles = []
    for start in range(0, len(lines), 5):
        record = lines[start:start + 5]
        if len(record) < 4:
            # Trailing fragment with no link line: nothing usable remains.
            break
        # record[0] is the node id and record[4], when present, the separator;
        # neither is needed.
        category_line, title_line, url_line = record[1:4]
        title_match = title_re.search(title_line)
        url_match = url_re.search(url_line)
        if not (title_match and url_match):
            continue
        # partition splits on the first ': ' only, so a value that itself
        # contains ': ' is kept whole.
        _, sep, category = category_line.partition(': ')
        articles.append({
            'title': title_match.group(1),
            'url': url_match.group(1),
            'category': category if sep else '未知',
        })

    logger.info(f"从文件 {file_path} 解析出 {len(articles)} 条数据")
    return articles
def check_duplicate(title, date_str, db_path='tophub_data.db'):
    """Report whether an article with this title and source date is stored.

    Args:
        title: Article title to look up.
        date_str: Source date string compared against ``source_date``.
        db_path: Path of the SQLite database file. Defaults to
            ``'tophub_data.db'``, the location this script has always used.

    Returns:
        True if at least one matching row exists in ``articles``, else False.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried (for
            example when the ``articles`` table does not exist).
    """
    conn = sqlite3.connect(db_path)
    try:
        # EXISTS stops at the first matching row instead of counting them all.
        row = conn.execute(
            '''
            SELECT EXISTS(
                SELECT 1 FROM articles
                WHERE title = ? AND source_date = ?
            )
            ''',
            (title, date_str),
        ).fetchone()
    finally:
        # Close even when the query fails so the handle is not leaked.
        conn.close()
    return bool(row[0])
def classify_title(title):
    """Ask the classification API for a short category label for *title*.

    A non-streaming request carrying ``API_MODEL`` and a fixed prompt is
    POSTed to ``API_URL`` with a 30 second timeout. The stripped ``response``
    field of the JSON reply is used as the label.

    Returns:
        The label when it is 2 to 8 characters long; otherwise ``'其他'``.
        ``'其他'`` is also returned when the request or decoding fails (the
        error is logged).
    """
    try:
        payload = {
            "model": API_MODEL,
            "prompt": f"目标:对以下文字内容进行分类,返回结果为类别,如\"社会新闻\"\"金融\"\"历史\"\"购物\"\"新质科技\"等等。目的只返回2-4个字不返回其它内容。内容{title}",
            "stream": False,
        }
        reply = requests.post(API_URL, json=payload, timeout=30)
        reply.raise_for_status()
        label = reply.json().get('response', '').strip()
        # Reject answers outside the accepted length window.
        if not 2 <= len(label) <= 8:
            label = '其他'
        logger.info(f"标题 '{title}' 分类为: {label}")
        return label
    except Exception as err:
        logger.error(f"API调用失败标题 '{title}': {err}")
        return '其他'
def insert_article(title, url, category, source_date, db_path='tophub_data.db'):
    """Insert one article row, stamping it with the current local time.

    Args:
        title: Article title.
        url: Article link.
        category: Category label to store.
        source_date: Date string the article belongs to.
        db_path: Path of the SQLite database file. Defaults to
            ``'tophub_data.db'``, the location this script has always used.

    Returns:
        True when the row was inserted and committed. False when the insert
        violated a constraint (logged as "already exists") or failed for any
        other reason (logged as an error).
    """
    conn = sqlite3.connect(db_path)
    try:
        created_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        conn.execute(
            '''
            INSERT INTO articles (title, url, category, source_date, created_at)
            VALUES (?, ?, ?, ?, ?)
            ''',
            (title, url, category, source_date, created_at),
        )
        conn.commit()
    except sqlite3.IntegrityError:
        # NOTE: any constraint violation lands here, not only the
        # UNIQUE(title, source_date) one; all are reported as "already exists".
        logger.warning(f"文章已存在,跳过: {title}")
        return False
    except Exception as e:
        logger.error(f"插入文章失败: {e}")
        return False
    else:
        logger.info(f"成功插入文章: {title}")
        return True
    finally:
        conn.close()
def process_temp_files():
    """Run the whole pipeline: discover, parse, classify and store articles.

    Steps:
      1. Initialise the database.
      2. Find the dump files; log a warning and return if there are none.
      3. For each file, take the source date from the ``YYYY年M月D日`` part of
         its name (today's date when the name has no such part), parse its
         articles, skip those already stored for that date, classify the
         rest and insert them.
      4. Log how many articles were seen and how many were inserted.
    """
    logger.info("开始处理临时文件...")

    init_database()

    temp_files = find_temp_files()
    if not temp_files:
        logger.warning("未找到临时文件")
        return

    seen_count = 0
    stored_count = 0

    for file_path in temp_files:
        logger.info(f"处理文件: {file_path}")

        # Derive the zero-padded ISO date from the file name.
        found = re.search(r'(\d{4})年(\d{1,2})月(\d{1,2})日', file_path)
        if found is None:
            source_date = datetime.now().strftime('%Y-%m-%d')
        else:
            year, month, day = found.groups()
            source_date = f"{year}-{int(month):02d}-{int(day):02d}"

        articles = parse_file_content(file_path)
        if not articles:
            continue

        for article in tqdm(articles, desc=f"处理 {file_path}"):
            seen_count += 1
            title = article['title']

            # Skip before classifying so no API call is spent on a known row.
            if check_duplicate(title, source_date):
                logger.info(f"跳过重复文章: {title}")
                continue

            category = classify_title(title)

            if insert_article(title, article['url'], category, source_date):
                stored_count += 1

    logger.info(f"处理完成! 总计处理: {seen_count}, 成功插入: {stored_count}")
if __name__ == "__main__":
    # Script entry point: run the pipeline, log any failure, then re-raise so
    # the traceback and the non-zero exit status are preserved.
    try:
        process_temp_files()
    except Exception as exc:
        logger.error(f"程序执行失败: {exc}")
        raise