第一次提交。
其中,爬取脚本是 tophub_scraper.py,数据入库脚本是 tophub_add_data_to_db.py,查看当前数据内容的脚本是 db_viewer.py。
This commit is contained in:
213
tophub_add_data_to_db.py
Normal file
213
tophub_add_data_to_db.py
Normal file
@@ -0,0 +1,213 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
处理临时文件并写入数据库的脚本
|
||||
读取指定格式的临时文件,提取标题和链接,调用API进行分类,然后写入SQLite数据库
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import requests
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime
|
||||
from tqdm import tqdm
|
||||
from loguru import logger
|
||||
import glob
|
||||
|
||||
# Logging setup: register a log-file sink with the shared logger.
logger.add("tophub_add_data_to_db.log", level="INFO", rotation="10 MB")

# Classification API settings: model name and endpoint URL used by
# classify_title().
API_MODEL = "gemma3:4b"
API_URL = "http://localhost:11434/api/generate"
|
||||
|
||||
def init_database(db_path='tophub_data.db'):
    """Create the SQLite database and its ``articles`` table if missing.

    The table stores one row per article (title, url, category, source_date,
    created_at) and enforces ``UNIQUE(title, source_date)``, so the same
    title can be stored at most once per source date.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            ``'tophub_data.db'``, the file this script has always used.

    Raises:
        sqlite3.Error: If the database cannot be opened or the table cannot
            be created.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS articles (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT NOT NULL,
                url TEXT NOT NULL,
                category TEXT,
                source_date TEXT NOT NULL,
                created_at TEXT NOT NULL,
                UNIQUE(title, source_date)
            )
        ''')
        conn.commit()
    finally:
        # Close the handle even when the DDL fails so it is never leaked.
        conn.close()

    logger.info("数据库初始化完成")
|
||||
|
||||
def find_temp_files():
    """Return the paths matching the glob pattern ``*年*月*日*.txt``.

    The pattern is relative, so matching happens against the current working
    directory. The number of matches and the matches themselves are logged
    before the list is returned.
    """
    matches = glob.glob("*年*月*日*.txt")
    logger.info(f"找到 {len(matches)} 个临时文件: {matches}")
    return matches
|
||||
|
||||
def parse_file_content(file_path):
    """Parse a temporary text file into a list of article dicts.

    The file is read as UTF-8 and consumed in fixed groups of five lines.
    Within each group the 2nd line carries the category, the 3rd line a
    ``标题: <title>`` field and the 4th line a ``链接: <url>`` field. The 1st
    and 5th lines are not used (presumably a record id and a separator --
    TODO confirm against the scraper's output format).

    A final group that is missing only its 5th line is still parsed, because
    nothing is read from that line.

    Args:
        file_path: Path of the text file to parse.

    Returns:
        A list of dicts with keys ``'title'``, ``'url'`` and ``'category'``,
        one per group whose title and link lines both match. ``'category'``
        is the text after the first ``': '`` on the category line, or
        ``'未知'`` when that line contains no ``': '``. An empty list is
        returned when the file cannot be read or decoded.
    """
    # Only the file read can realistically fail; keep the try that narrow.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except (OSError, ValueError) as e:
        # UnicodeDecodeError is a ValueError subclass.
        logger.error(f"解析文件 {file_path} 失败: {e}")
        return []

    articles = []
    for start in range(0, len(lines), 5):
        group = lines[start:start + 5]
        if len(group) < 4:
            # Too short to contain both the title and the link line.
            continue

        title_match = re.search(r'标题: (.+)', group[2].strip())
        url_match = re.search(r'链接: (.+)', group[3].strip())
        if not (title_match and url_match):
            continue

        # partition() keeps everything after the first ': ', so a category
        # value that itself contains ': ' is not truncated.
        _, found, category = group[1].strip().partition(': ')
        articles.append({
            'title': title_match.group(1),
            'url': url_match.group(1),
            'category': category if found else '未知',
        })

    logger.info(f"从文件 {file_path} 解析出 {len(articles)} 条数据")
    return articles
|
||||
|
||||
def check_duplicate(title, date_str, db_path='tophub_data.db'):
    """Return whether an article with this title and source date is stored.

    Args:
        title: Article title to look up.
        date_str: Source date string compared against the ``source_date``
            column.
        db_path: Path of the SQLite database file. Defaults to
            ``'tophub_data.db'``, the file this script has always used.

    Returns:
        ``True`` if at least one matching row exists in ``articles``,
        otherwise ``False``.

    Raises:
        sqlite3.Error: If the query fails (for example when the ``articles``
            table does not exist).
    """
    conn = sqlite3.connect(db_path)
    try:
        # An existence probe is enough; there is no need to count all rows.
        row = conn.execute(
            '''
            SELECT 1 FROM articles
            WHERE title = ? AND source_date = ?
            LIMIT 1
            ''',
            (title, date_str),
        ).fetchone()
    finally:
        # Close the handle even when the query raises.
        conn.close()

    return row is not None
|
||||
|
||||
def _normalize_category(raw):
    """Reduce a raw model reply to a bare category label, or '其他' if unusable."""
    if not isinstance(raw, str):
        return '其他'

    # The prompt shows its example categories in quotes, so replies often
    # come back quoted or with a trailing full stop; strip those wrappers
    # before validating so "金融" and 金融 end up as the same category.
    label = raw.strip().strip('"\'“”‘’「」『』《》【】。.').strip()

    # Same length window as before: anything outside 2-8 characters is
    # treated as an unusable answer.
    if len(label) < 2 or len(label) > 8:
        return '其他'
    return label


def classify_title(title):
    """Ask the classification API for a short category label for a title.

    Sends a non-streaming request with model ``API_MODEL`` to ``API_URL`` and
    reads the ``response`` field of the JSON reply. Surrounding quotes,
    brackets, full stops and whitespace are stripped from the reply before
    it is validated.

    Args:
        title: Article title to classify.

    Returns:
        The category label (2-8 characters), or ``'其他'`` when the reply is
        missing, too short, too long, or the request fails for any reason.
        This function never raises.
    """
    prompt = f"目标:对以下文字内容进行分类,返回结果为类别,如\"社会新闻\",\"金融\",\"历史\",\"购物\",\"新质科技\"等等。目的:只返回2-4个字,不返回其它内容。内容:{title}"
    payload = {
        "model": API_MODEL,
        "prompt": prompt,
        "stream": False,
    }

    try:
        response = requests.post(API_URL, json=payload, timeout=30)
        response.raise_for_status()
        category = _normalize_category(response.json().get('response', ''))
    except Exception as e:
        # Deliberately broad: classification is best effort, and one failed
        # or malformed reply must not abort the whole import run.
        logger.error(f"API调用失败,标题 '{title}': {e}")
        return '其他'

    logger.info(f"标题 '{title}' 分类为: {category}")
    return category
|
||||
|
||||
def insert_article(title, url, category, source_date, db_path='tophub_data.db'):
    """Insert one article row into the ``articles`` table.

    ``created_at`` is set to the current local time formatted as
    ``YYYY-MM-DD HH:MM:SS``.

    Args:
        title: Article title.
        url: Article link.
        category: Category label to store.
        source_date: Source date string stored in ``source_date``.
        db_path: Path of the SQLite database file. Defaults to
            ``'tophub_data.db'``, the file this script has always used.

    Returns:
        ``True`` if the row was inserted and committed. ``False`` if the
        insert violated a table constraint (logged as "already exists") or
        failed for any other reason (logged as an error).
    """
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        created_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        conn.execute(
            '''
            INSERT INTO articles (title, url, category, source_date, created_at)
            VALUES (?, ?, ?, ?, ?)
            ''',
            (title, url, category, source_date, created_at),
        )
        conn.commit()
        logger.info(f"成功插入文章: {title}")
        return True

    except sqlite3.IntegrityError:
        # Raised when the UNIQUE(title, source_date) constraint is violated.
        logger.warning(f"文章已存在,跳过: {title}")
        return False

    except Exception as e:
        # Best effort: one bad row must not stop the batch.
        logger.error(f"插入文章失败: {e}")
        return False

    finally:
        if conn is not None:
            conn.close()
|
||||
|
||||
def process_temp_files():
    """Run the import pipeline over every temporary file found.

    Steps: initialise the database, locate the temporary files, and for each
    file derive the source date from its name, parse its articles, skip those
    already stored for that date, classify the rest through the API and
    insert them. A summary with the processed and inserted counts is logged
    at the end. Returns ``None``.
    """
    logger.info("开始处理临时文件...")

    # Make sure the table exists before anything is queried or inserted.
    init_database()

    temp_files = find_temp_files()
    if not temp_files:
        logger.warning("未找到临时文件")
        return

    seen = 0
    stored = 0

    for file_path in temp_files:
        logger.info(f"处理文件: {file_path}")

        # Source date comes from the "<YYYY>年<M>月<D>日" part of the file
        # name, zero-padded to YYYY-MM-DD; fall back to today's date when
        # the name carries no such part.
        date_match = re.search(r'(\d{4})年(\d{1,2})月(\d{1,2})日', file_path)
        if date_match is None:
            source_date = datetime.now().strftime('%Y-%m-%d')
        else:
            year, month, day = date_match.groups()
            source_date = f"{year}-{int(month):02d}-{int(day):02d}"

        articles = parse_file_content(file_path)
        if not articles:
            continue

        for article in tqdm(articles, desc=f"处理 {file_path}"):
            seen += 1
            title = article['title']

            # Skip before classifying so no API call is spent on a row that
            # is already stored.
            if check_duplicate(title, source_date):
                logger.info(f"跳过重复文章: {title}")
                continue

            category = classify_title(title)

            if insert_article(title, article['url'], category, source_date):
                stored += 1

    logger.info(f"处理完成! 总计处理: {seen}, 成功插入: {stored}")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the pipeline, log any uncaught failure and
    # re-raise it so the error is not hidden from the caller.
    try:
        process_temp_files()
    except Exception as exc:
        logger.error(f"程序执行失败: {exc}")
        raise
|
||||
Reference in New Issue
Block a user