Files
tophux_scrape/tophub_add_data_to_db.py
xiaji 25da264413 第一次提交。
其中爬取是tophub_scraper.py
数据入库是 tophub_add_data_to_db.py
查看当前数据内容是 db_viewer.py
2025-11-09 17:20:44 +08:00

213 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
处理临时文件并写入数据库的脚本
读取指定格式的临时文件提取标题和链接调用API进行分类然后写入SQLite数据库
"""
import sqlite3
import requests
import os
import re
from datetime import datetime
from tqdm import tqdm
from loguru import logger
import glob
# Logging configuration: register a log-file sink with loguru that records
# messages at INFO level and above, with rotation configured at "10 MB".
logger.add("tophub_add_data_to_db.log", rotation="10 MB", level="INFO")
# Classification API configuration: the endpoint that classify_title() posts
# to and the model name sent in the request payload.
# NOTE(review): localhost:11434 looks like a local Ollama server — confirm.
API_URL = "http://localhost:11434/api/generate"
API_MODEL = "gemma3:4b"
def init_database():
    """Create the ``articles`` table in ``tophub_data.db`` if it is missing.

    The database path is relative, so the file is opened (and created on
    first use) in the current working directory. The statement uses
    ``CREATE TABLE IF NOT EXISTS``, so calling this repeatedly is harmless.
    The ``UNIQUE(title, source_date)`` constraint allows a given title to be
    stored at most once per source date.

    Raises:
        sqlite3.Error: If the database cannot be opened or the DDL fails.
    """
    conn = sqlite3.connect('tophub_data.db')
    try:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS articles (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                url TEXT NOT NULL,
                category TEXT,
                source_date TEXT NOT NULL,
                created_at TEXT NOT NULL,
                UNIQUE(title, source_date)
            )
        ''')
        conn.commit()
    finally:
        # Release the connection even when the DDL or commit raises.
        conn.close()
    logger.info("数据库初始化完成")
def find_temp_files():
    """Return the temp files in the working directory named like a date.

    A file qualifies when its name matches the glob ``*年*月*日*.txt``.
    The number of matches and the matches themselves are logged.

    Returns:
        The list of matching paths exactly as produced by ``glob.glob``.
    """
    matches = glob.glob("*年*月*日*.txt")
    logger.info(f"找到 {len(matches)} 个临时文件: {matches}")
    return matches
def parse_file_content(file_path):
    """Parse a scraped temp file into a list of article dicts.

    The file is read as UTF-8 and consumed in fixed groups of five lines:
    node id, category, title, URL, separator. Only the category, title and
    URL lines carry data that is kept.

    Args:
        file_path: Path of the text file to parse.

    Returns:
        A list of dicts with the keys ``title``, ``url`` and ``category``.
        A record is skipped when its title line does not contain
        ``标题: <text>`` or its URL line does not contain ``链接: <text>``.
        ``category`` is everything after the first ``': '`` on the category
        line, or ``'未知'`` when that delimiter is absent. An empty list is
        returned (and the error logged) if reading or parsing fails.
    """
    articles = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        for start in range(0, len(lines), 5):
            record = lines[start:start + 5]
            # Only the first four lines are used; the trailing separator is
            # optional so the last record is not lost when the file does not
            # end with a separator line.
            if len(record) < 4:
                continue
            title_match = re.search(r'标题: (.+)', record[2].strip())
            url_match = re.search(r'链接: (.+)', record[3].strip())
            if not (title_match and url_match):
                continue
            # Split on the first delimiter only, so a category value that
            # itself contains ': ' is kept whole.
            _, delimiter, category = record[1].strip().partition(': ')
            articles.append({
                'title': title_match.group(1),
                'url': url_match.group(1),
                'category': category if delimiter else '未知',
            })
        logger.info(f"从文件 {file_path} 解析出 {len(articles)} 条数据")
        return articles
    except Exception as e:
        # Best effort: a broken file must not abort the whole run.
        logger.error(f"解析文件 {file_path} 失败: {e}")
        return []
def check_duplicate(title, date_str):
    """Report whether an article with this title and source date is stored.

    Args:
        title: Article title to look up.
        date_str: Value compared against the ``source_date`` column.

    Returns:
        True if at least one row in ``articles`` has this exact
        ``title`` / ``source_date`` pair, otherwise False.

    Raises:
        sqlite3.Error: If the query fails, e.g. the table does not exist.
    """
    conn = sqlite3.connect('tophub_data.db')
    try:
        cursor = conn.cursor()
        cursor.execute('''
            SELECT COUNT(*) FROM articles
            WHERE title = ? AND source_date = ?
        ''', (title, date_str))
        count = cursor.fetchone()[0]
    finally:
        # Close the connection even when the query raises.
        conn.close()
    return count > 0
def classify_title(title):
    """Ask the generation API for a short category label for *title*.

    A non-streaming request is posted to ``API_URL`` with ``API_MODEL`` and
    the label is read from the ``response`` field of the JSON reply.

    Args:
        title: The article title to classify.

    Returns:
        The stripped label when it is 2 to 8 characters long; otherwise
        ``'其他'``. ``'其他'`` is also returned, after logging the error,
        when anything in the request or decoding raises.
    """
    try:
        payload = {
            "model": API_MODEL,
            "prompt": f"目标:对以下文字内容进行分类,返回结果为类别,如\"社会新闻\"\"金融\"\"历史\"\"购物\"\"新质科技\"等等。目的只返回2-4个字不返回其它内容。内容{title}",
            "stream": False,
        }
        reply = requests.post(API_URL, json=payload, timeout=30)
        reply.raise_for_status()
        category = reply.json().get('response', '').strip()
        # Reject labels outside the accepted length window.
        if not 2 <= len(category) <= 8:
            category = '其他'
        logger.info(f"标题 '{title}' 分类为: {category}")
        return category
    except Exception as exc:
        logger.error(f"API调用失败标题 '{title}': {exc}")
        return '其他'
def insert_article(title, url, category, source_date):
    """Insert one article row into ``tophub_data.db``.

    ``created_at`` is set to the current local time formatted as
    ``%Y-%m-%d %H:%M:%S``.

    Args:
        title: Article title.
        url: Article link.
        category: Category label to store.
        source_date: Date string stored in the ``source_date`` column.

    Returns:
        True when the row was inserted and committed. False when the insert
        hit an integrity error (logged as already existing) or any other
        exception (logged as a failure).
    """
    db = sqlite3.connect('tophub_data.db')
    cur = db.cursor()
    inserted = False
    try:
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        row = (title, url, category, source_date, timestamp)
        cur.execute('''
            INSERT INTO articles (title, url, category, source_date, created_at)
            VALUES (?, ?, ?, ?, ?)
        ''', row)
        db.commit()
        logger.info(f"成功插入文章: {title}")
        inserted = True
    except sqlite3.IntegrityError:
        logger.warning(f"文章已存在,跳过: {title}")
    except Exception as exc:
        logger.error(f"插入文章失败: {exc}")
    finally:
        # The connection is closed on every path.
        db.close()
    return inserted
def process_temp_files():
    """Run the whole pipeline: find temp files, classify and store articles.

    Steps, in order:
      1. Initialise the database.
      2. Locate the temp files; stop with a warning when there are none.
      3. For each file, derive the source date from a ``YYYY年M月D日`` part of
         the path (falling back to today's date), parse its articles, skip
         the ones already stored for that date, classify the rest and
         insert them.
      4. Log how many articles were seen and how many were inserted.
    """
    logger.info("开始处理临时文件...")
    init_database()

    temp_files = find_temp_files()
    if not temp_files:
        logger.warning("未找到临时文件")
        return

    processed_count = 0
    inserted_count = 0
    for file_path in temp_files:
        logger.info(f"处理文件: {file_path}")

        # The source date comes from the file name when it carries one.
        date_match = re.search(r'(\d{4})年(\d{1,2})月(\d{1,2})日', file_path)
        if date_match is None:
            source_date = datetime.now().strftime('%Y-%m-%d')
        else:
            year, month, day = date_match.groups()
            source_date = f"{year}-{int(month):02d}-{int(day):02d}"

        articles = parse_file_content(file_path)
        if not articles:
            continue

        for article in tqdm(articles, desc=f"处理 {file_path}"):
            processed_count += 1
            title = article['title']
            # Skip before classifying so no API call is spent on duplicates.
            if check_duplicate(title, source_date):
                logger.info(f"跳过重复文章: {title}")
                continue
            category = classify_title(title)
            if insert_article(title, article['url'], category, source_date):
                inserted_count += 1

    logger.info(f"处理完成! 总计处理: {processed_count}, 成功插入: {inserted_count}")
if __name__ == "__main__":
    # Script entry point: run the pipeline, and if it fails, record the
    # error in the log before letting the exception propagate.
    try:
        process_temp_files()
    except Exception as exc:
        logger.error(f"程序执行失败: {exc}")
        raise