第一次提交。
其中爬取是tophub_scraper.py 数据入库是 tophub_add_data_to_db.py 查看当前数据内容是 db_viewer.py
This commit is contained in:
220
db_modify.py
Normal file
220
db_modify.py
Normal file
@@ -0,0 +1,220 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
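"""
Open the tophub_data.db database, read the articles table and collect every
distinct category.
Call the local Ollama API to shorten each category name (the prompt asks for
3-6 Chinese characters) and strip embedded spaces and special characters.
"""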
|
||||
|
||||
import requests
|
||||
import sqlite3
|
||||
import re
|
||||
import time
|
||||
from loguru import logger
|
||||
|
||||
# Configure logging: add a file sink named db_modify.log with a 10 MB rotation
# setting that records messages at INFO level and above.
logger.add("db_modify.log", rotation="10 MB", level="INFO")
|
||||
|
||||
class CategoryModifier:
    """Clean up the category names stored in the ``articles`` table.

    Every distinct category is sent to an Ollama text-generation endpoint that
    is asked for a short, readable replacement; the rows carrying the old name
    are then rewritten in the SQLite database.
    """

    # Matches every character that is NOT a CJK ideograph in U+4E00..U+9FA5,
    # an ASCII letter or an ASCII digit. Whitespace matches as well, so no
    # separate whitespace pass is needed.
    _DISALLOWED_CHARS = re.compile(r'[^\u4e00-\u9fa5a-zA-Z0-9]')

    # A "<think>...</think>" block; an unterminated block runs to end of text.
    # NOTE(review): reasoning models such as the default qwen3 may emit such a
    # block before the answer - confirm against the deployed Ollama version.
    _THINK_BLOCK = re.compile(r'<think>.*?(?:</think>|\Z)', re.DOTALL)

    def __init__(self, db_path="tophub_data.db",
                 ollama_url="http://localhost:11434/api/generate",
                 model="qwen3:8b", timeout=30):
        """
        Store the database location and the Ollama connection settings.

        Args:
            db_path (str): Path of the SQLite database file.
            ollama_url (str): URL of the Ollama text-generation endpoint.
            model (str): Name of the Ollama model to query.
            timeout (float): Timeout in seconds for each Ollama request.
        """
        self.db_path = db_path
        self.ollama_url = ollama_url
        self.model = model
        self.timeout = timeout

    @classmethod
    def _strip_reasoning(cls, text):
        """Return ``text`` without ``<think>`` blocks and surrounding whitespace."""
        return cls._THINK_BLOCK.sub('', text).strip()

    def get_all_categories(self):
        """
        Fetch every distinct, non-empty category from the ``articles`` table.

        Returns:
            list: The distinct category values; an empty list when the
            database cannot be read (the error is logged).
        """
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                rows = conn.execute(
                    "SELECT DISTINCT category FROM articles"
                ).fetchall()
            finally:
                # Close even when the query fails so the handle never leaks.
                conn.close()
        except sqlite3.Error as e:
            logger.error(f"获取类别时出错: {e}")
            return []

        # NULL and empty-string categories are skipped.
        categories = [row[0] for row in rows if row[0]]
        logger.info(f"成功获取 {len(categories)} 个唯一类别")
        return categories

    def clean_category_name(self, category):
        """
        Remove every character that is not Chinese, an ASCII letter or a digit.

        Args:
            category (str): Raw category name.

        Returns:
            str: The cleaned name; ``""`` when ``category`` is empty or None.
        """
        if not category:
            return ""
        return self._DISALLOWED_CHARS.sub('', category)

    def optimize_category_with_ollama(self, category):
        """
        Ask the Ollama model for a shorter, clearer version of ``category``.

        Args:
            category (str): Raw category name.

        Returns:
            str: The cleaned model answer. When the request fails or the
            answer is empty after cleaning, the cleaned original name is
            returned instead; if that is empty too, ``category`` itself is
            returned so a name is never replaced by an empty string.
        """
        fallback = self.clean_category_name(category) or category

        prompt = (
            f"请将以下类别名称简化为3-6个汉字,去除空格和特殊符号,更容易理解,并保持原意:'{category}'。"
            "例子一:'新科科技',优化为'新质生产力'。例子二:'产设',优化为'产品设计'。例子三:'史人',优化为'历史人物'。"
            # Without this the model tends to add explanations, which would be
            # concatenated into the stored name by the cleaning step.
            "只输出优化后的名称,不要输出任何解释。"
        )
        payload = {
            "model": self.model,
            "prompt": prompt,
            "stream": False,
        }

        try:
            response = requests.post(
                self.ollama_url, json=payload, timeout=self.timeout
            )
            response.raise_for_status()
            answer = response.json().get("response") or ""
            optimized = self.clean_category_name(self._strip_reasoning(answer))
        except Exception as e:
            # Deliberate best-effort: any failure talking to the model must
            # not abort the batch, so fall back to local cleaning.
            logger.error(f"优化类别 '{category}' 时出错: {e}")
            return fallback

        if not optimized:
            logger.warning(f"类别 '{category}' 的优化结果为空,保留清理后的原始名称")
            return fallback

        logger.info(f"类别 '{category}' 优化为 '{optimized}'")
        return optimized

    def update_category_in_db(self, old_category, new_category):
        """
        Rename a category on every row of the ``articles`` table.

        Args:
            old_category (str): Category value to look for.
            new_category (str): Value to store instead.

        Returns:
            bool: True when the update was committed, False when a database
            error occurred (the error is logged).
        """
        try:
            conn = sqlite3.connect(self.db_path)
            try:
                cursor = conn.execute(
                    "UPDATE articles SET category = ? WHERE category = ?",
                    (new_category, old_category),
                )
                count = cursor.rowcount
                conn.commit()
            finally:
                # Close even when the statement fails so the handle never leaks.
                conn.close()
        except sqlite3.Error as e:
            logger.error(f"更新类别 '{old_category}' 时出错: {e}")
            return False

        logger.info(f"成功更新类别 '{old_category}' 为 '{new_category}',影响 {count} 条记录")
        return True

    def process_all_categories(self):
        """
        Optimize every category and write the changed names to the database.

        Progress, per-category timing and a final summary are logged. A
        category whose database update fails is counted as unchanged.
        """
        logger.info("开始处理所有类别...")

        categories = self.get_all_categories()
        if not categories:
            logger.warning("未找到任何类别")
            return

        total_categories = len(categories)
        processed_count = 0
        unchanged_count = 0
        updated_count = 0
        # Monotonic clock: durations stay correct if the wall clock is adjusted.
        start_time = time.monotonic()

        logger.info(f"总共需要处理 {total_categories} 个类别")

        for i, category in enumerate(categories, 1):
            category_start_time = time.monotonic()
            logger.info(f"处理进度: {i}/{total_categories} ({i/total_categories*100:.1f}%) - 类别: {category}")

            optimized_category = self.optimize_category_with_ollama(category)

            # Never overwrite a category with an empty name.
            if optimized_category and optimized_category != category:
                if self.update_category_in_db(category, optimized_category):
                    updated_count += 1
                    logger.info(f"类别 '{category}' 已更新为 '{optimized_category}'")
                else:
                    # The failure was already logged by update_category_in_db.
                    unchanged_count += 1
            else:
                unchanged_count += 1
                logger.info(f"类别 '{category}' 无需更改")

            processed_count += 1
            now = time.monotonic()
            category_duration = now - category_start_time
            avg_time_per_category = (now - start_time) / processed_count
            estimated_remaining = avg_time_per_category * (total_categories - processed_count)

            logger.info(f"类别 '{category}' 处理完成,耗时: {category_duration:.2f}秒")
            logger.info(f"累计处理: {processed_count}/{total_categories} | "
                        f"已更新: {updated_count} | 未更改: {unchanged_count} | "
                        f"平均耗时: {avg_time_per_category:.2f}秒/类别 | "
                        f"预计剩余时间: {estimated_remaining:.2f}秒")

        total_duration = time.monotonic() - start_time
        logger.info("="*60)
        logger.info("所有类别处理完成!")
        logger.info(f"总计处理类别数: {total_categories}")
        logger.info(f"更新类别数: {updated_count}")
        logger.info(f"未更改类别数: {unchanged_count}")
        logger.info(f"总耗时: {total_duration:.2f}秒")
        logger.info(f"平均每类别处理时间: {total_duration/total_categories:.2f}秒")
        logger.info("="*60)
|
||||
|
||||
def main():
    """Check that the Ollama server is reachable, then process all categories.

    Returns early, after logging a warning, when the server cannot be reached
    or does not answer the tag-listing request with HTTP 200.
    """
    modifier = CategoryModifier()

    # Probe the same server the modifier will call, instead of a separately
    # hard-coded host that could drift from modifier.ollama_url.
    tags_url = modifier.ollama_url.rsplit("/api/", 1)[0] + "/api/tags"

    try:
        response = requests.get(tags_url, timeout=5)
    except requests.RequestException as e:
        logger.warning(f"无法连接到Ollama服务: {e}")
        logger.info("请确保Ollama服务已在本地运行")
        return

    if response.status_code != 200:
        logger.warning("Ollama服务不可用,请确保服务已启动")
        return
    logger.info("Ollama服务可用")

    modifier.process_all_categories()
|
||||
|
||||
# Run the category clean-up only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user