第一次提交。

其中爬取是tophub_scraper.py 数据入库是 tophub_add_data_to_db.py 查看当前数据内容是 db_viewer.py
2025-11-09 17:20:44 +08:00
commit 25da264413
29 changed files with 28508 additions and 0 deletions
--- a/tophub_add_data_to_db.py
+++ b/tophub_add_data_to_db.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python3
+"""
+处理临时文件并写入数据库的脚本
+读取指定格式的临时文件，提取标题和链接，调用API进行分类，然后写入SQLite数据库
+"""
+
+import sqlite3
+import requests
+import os
+import re
+from datetime import datetime
+from tqdm import tqdm
+from loguru import logger
+import glob
+
+# 配置日志
+logger.add("tophub_add_data_to_db.log", rotation="10 MB", level="INFO")
+
+# API配置
+API_URL = "http://localhost:11434/api/generate"
+API_MODEL = "gemma3:4b"
+
+def init_database():
+    """初始化数据库，创建表结构"""
+    conn = sqlite3.connect('tophub_data.db')
+    cursor = conn.cursor()
+    
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS articles (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            title TEXT NOT NULL,
+            url TEXT NOT NULL,
+            category TEXT,
+            source_date TEXT NOT NULL,
+            created_at TEXT NOT NULL,
+            UNIQUE(title, source_date)
+        )
+    ''')
+    
+    conn.commit()
+    conn.close()
+    logger.info("数据库初始化完成")
+
+def find_temp_files():
+    """查找符合格式的临时文件"""
+    pattern = "*年*月*日*.txt"
+    files = glob.glob(pattern)
+    logger.info(f"找到 {len(files)} 个临时文件: {files}")
+    return files
+
+def parse_file_content(file_path):
+    """解析文件内容，按5行一个循环提取数据"""
+    articles = []
+    
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            lines = f.readlines()
+        
+        # 按5行一组进行解析
+        for i in range(0, len(lines), 5):
+            if i + 4 < len(lines):
+                node_id = lines[i].strip()
+                category = lines[i+1].strip()
+                title = lines[i+2].strip()
+                url = lines[i+3].strip()
+                separator = lines[i+4].strip() if i+4 < len(lines) else ""
+                
+                # 提取关键信息
+                title_match = re.search(r'标题: (.+)', title)
+                url_match = re.search(r'链接: (.+)', url)
+                
+                if title_match and url_match:
+                    articles.append({
+                        'title': title_match.group(1),
+                        'url': url_match.group(1),
+                        'category': category.split(': ')[1] if ': ' in category else '未知'
+                    })
+        
+        logger.info(f"从文件 {file_path} 解析出 {len(articles)} 条数据")
+        return articles
+        
+    except Exception as e:
+        logger.error(f"解析文件 {file_path} 失败: {e}")
+        return []
+
+def check_duplicate(title, date_str):
+    """检查标题+日期是否已存在"""
+    conn = sqlite3.connect('tophub_data.db')
+    cursor = conn.cursor()
+    
+    cursor.execute('''
+        SELECT COUNT(*) FROM articles 
+        WHERE title = ? AND source_date = ?
+    ''', (title, date_str))
+    
+    count = cursor.fetchone()[0]
+    conn.close()
+    
+    return count > 0
+
+def classify_title(title):
+    """调用API对标题进行分类"""
+    try:
+        prompt = f"目标：对以下文字内容进行分类，返回结果为类别，如\"社会新闻\"，\"金融\"，\"历史\"，\"购物\"，\"新质科技\"等等。目的：只返回2-4个字，不返回其它内容。内容：{title}"
+        
+        data = {
+            "model": API_MODEL,
+            "prompt": prompt,
+            "stream": False
+        }
+        
+        response = requests.post(API_URL, json=data, timeout=30)
+        response.raise_for_status()
+        
+        result = response.json()
+        category = result.get('response', '').strip()
+        
+        # 验证分类结果长度
+        if len(category) < 2 or len(category) > 8:
+            category = '其他'
+            
+        logger.info(f"标题 '{title}' 分类为: {category}")
+        return category
+        
+    except Exception as e:
+        logger.error(f"API调用失败，标题 '{title}': {e}")
+        return '其他'
+
+def insert_article(title, url, category, source_date):
+    """插入文章到数据库"""
+    conn = sqlite3.connect('tophub_data.db')
+    cursor = conn.cursor()
+    
+    try:
+        created_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+        cursor.execute('''
+            INSERT INTO articles (title, url, category, source_date, created_at)
+            VALUES (?, ?, ?, ?, ?)
+        ''', (title, url, category, source_date, created_at))
+        
+        conn.commit()
+        logger.info(f"成功插入文章: {title}")
+        return True
+        
+    except sqlite3.IntegrityError:
+        logger.warning(f"文章已存在，跳过: {title}")
+        return False
+        
+    except Exception as e:
+        logger.error(f"插入文章失败: {e}")
+        return False
+        
+    finally:
+        conn.close()
+
+def process_temp_files():
+    """主处理函数"""
+    logger.info("开始处理临时文件...")
+    
+    # 初始化数据库
+    init_database()
+    
+    # 查找临时文件
+    temp_files = find_temp_files()
+    
+    if not temp_files:
+        logger.warning("未找到临时文件")
+        return
+    
+    total_processed = 0
+    total_inserted = 0
+    
+    # 处理每个文件
+    for file_path in temp_files:
+        logger.info(f"处理文件: {file_path}")
+        
+        # 从文件名提取日期
+        date_match = re.search(r'(\d{4})年(\d{1,2})月(\d{1,2})日', file_path)
+        if date_match:
+            source_date = f"{date_match.group(1)}-{int(date_match.group(2)):02d}-{int(date_match.group(3)):02d}"
+        else:
+            source_date = datetime.now().strftime('%Y-%m-%d')
+        
+        # 解析文件内容
+        articles = parse_file_content(file_path)
+        
+        if not articles:
+            continue
+        
+        # 处理每篇文章
+        for article in tqdm(articles, desc=f"处理 {file_path}"):
+            total_processed += 1
+            
+            # 检查重复
+            if check_duplicate(article['title'], source_date):
+                logger.info(f"跳过重复文章: {article['title']}")
+                continue
+            
+            # 分类标题
+            category = classify_title(article['title'])
+            
+            # 插入数据库
+            if insert_article(article['title'], article['url'], category, source_date):
+                total_inserted += 1
+    
+    logger.info(f"处理完成! 总计处理: {total_processed}, 成功插入: {total_inserted}")
+
+if __name__ == "__main__":
+    try:
+        process_temp_files()
+    except Exception as e:
+        logger.error(f"程序执行失败: {e}")
+        raise