Files
tophux_scrape/db_modify_zhipu.py
xiaji 25da264413 第一次提交。
其中爬取是tophub_scraper.py
数据入库是 tophub_add_data_to_db.py
查看当前数据内容是 db_viewer.py
2025-11-09 17:20:44 +08:00

102 lines
3.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
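# Re-classify every article by calling the Zhipu AI API.
# Reads each row of the articles table in the SQLite db file, takes its title (the second column), submits the title to the API, and writes the returned category back to the db file.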
import os
import sqlite3
import time

from loguru import logger
from zhipuai import ZhipuAI
# Configure logging: also write INFO-and-above records to a log file that
# rotates once it reaches 10 MB.
logger.add("db_modify_zhipu.log", rotation="10 MB", level="INFO")

# Initialise the Zhipu AI client.
# The key is taken from the ZHIPUAI_API_KEY environment variable when it is
# set and non-empty.
# SECURITY NOTE(review): the fallback literal below is a credential that was
# committed to the repository. It is kept only so existing runs keep working;
# rotate the key and delete the fallback as soon as the environment variable
# is provisioned everywhere this script runs.
client = ZhipuAI(
    api_key=os.environ.get("ZHIPUAI_API_KEY")
    or "fad3d9f9a45f4d939f0e7a7133fa07bf.X4bOO053GAIPKLE5"
)
def get_simplified_category(title):
    """Ask the Zhipu chat API for a short category label for a title.

    Args:
        title: Article title to classify. ``None``, empty, or
            whitespace-only titles are rejected without calling the API.

    Returns:
        The model's reply with surrounding whitespace stripped, or ``None``
        when the title is blank, the API call fails, or the reply is empty.
    """
    # A blank title gives the model nothing to classify, so do not spend an
    # API call on it; the caller treats None as a failure.
    if not title or not str(title).strip():
        return None

    try:
        # NOTE(review): the system prompt asks for 3-6 characters while the
        # user prompt asks for 2-6; confirm which range is intended.
        response = client.chat.completions.create(
            model="glm-4-flash",
            messages=[
                {
                    "role": "system",
                    "content": "你是一个专业的分类助手。请根据文章标题提供一个3-6个汉字的简化分类名称去除空格和特殊符号更容易理解并保持原意。",
                },
                {
                    "role": "user",
                    "content": f"对以下文字内容进行分类,返回结果为类别,如\"社会新闻\"\"机器人\"\"金融\"\"历史\"\"购物\"\"新质生产力\"等等。目的只返回2-6个汉字不返回其它内容。内容'{title}'",
                },
            ],
            temperature=0.7,
        )
        # The reply content may be missing; treat that the same as an empty
        # reply instead of failing on None.strip().
        category = (response.choices[0].message.content or "").strip()
        if not category:
            logger.error("获取分类失败: API返回内容为空")
            return None
        logger.info(f"标题: {title[:30]}... -> 分类: {category}")
        return category
    except Exception as e:
        # Best-effort: any API or parsing failure is logged and reported to
        # the caller as None so one bad title does not stop a batch.
        logger.error(f"获取分类失败: {e}")
        return None
def update_database_categories(db_path='tophub_data.db', delay=1):
    """Replace non-simplified article categories with API-generated labels.

    Reads ``id``, ``title`` and ``category`` for every row of the
    ``articles`` table, skips rows whose category already looks simplified
    (non-empty, at most 6 characters, none of the listed punctuation or
    space characters), and for each remaining row stores the label returned
    by ``get_simplified_category``. Each successful update is committed
    immediately, so progress survives a later failure.

    Args:
        db_path: Path of the SQLite database file to update.
        delay: Seconds to sleep after each API attempt to limit the call
            rate.

    Returns:
        None. Results are reported through the log. Unexpected errors are
        logged and the open transaction is rolled back rather than raised.
    """
    # Characters that mark a category as "not yet simplified". Built once as
    # a set instead of being re-scanned for every record.
    # NOTE(review): the literal contains duplicates, which suggests
    # full-width variants may have been lost at some point - confirm.
    special_chars = set(" ,.!?;:,。!?;:")

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT id, title, category FROM articles")
        records = cursor.fetchall()
        logger.info(f"共找到 {len(records)} 条记录需要处理")

        updated_count = 0
        failed_count = 0

        for record_id, title, current_category in records:
            # Skip categories that are already short and free of the
            # special characters.
            if (
                current_category
                and len(current_category) <= 6
                and not any(ch in special_chars for ch in current_category)
            ):
                logger.info(f"跳过记录 {record_id},分类已简化: {current_category}")
                continue

            # A NULL or empty title cannot be classified; count it as a
            # failure instead of letting title[:30] raise and abort the
            # whole batch.
            if not title:
                failed_count += 1
                logger.error(f"跳过记录 {record_id},标题为空")
                continue

            logger.info(f"处理记录 {record_id}: {title[:30]}...")

            new_category = get_simplified_category(title)
            if new_category:
                cursor.execute(
                    "UPDATE articles SET category = ? WHERE id = ?",
                    (new_category, record_id),
                )
                # Commit per record so completed work is not lost if a
                # later record fails.
                conn.commit()
                updated_count += 1
                logger.info(f"已更新记录 {record_id} 的分类为: {new_category}")
            else:
                failed_count += 1
                logger.error(f"无法获取记录 {record_id} 的新分类")

            # Throttle so the API is not called too frequently.
            time.sleep(delay)

        logger.info(f"处理完成! 成功更新 {updated_count} 条记录,失败 {failed_count} 条记录")
    except Exception as e:
        # Top-level boundary for the batch: log, discard any uncommitted
        # change, and let the caller continue.
        logger.error(f"更新数据库时出错: {e}")
        conn.rollback()
    finally:
        conn.close()
if __name__ == "__main__":
    # Script entry point: log the start, run the category update over the
    # database, then log completion.
    logger.info("开始更新数据库分类...")
    update_database_categories()
    logger.info("程序执行完成")