第一次提交。
其中，爬取脚本是 tophub_scraper.py，数据入库脚本是 tophub_add_data_to_db.py，查看当前数据内容的脚本是 db_viewer.py。
This commit is contained in:
101
db_modify_zhipu.py
Normal file
101
db_modify_zhipu.py
Normal file
@@ -0,0 +1,101 @@
|
||||
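# Call the Zhipu API to revise the category of every stored item.
# Read the table from the db file, take each record's title (the second column),
# submit the title to the API, and write the returned category back to the db file.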
|
||||
|
||||
import os
import sqlite3
import time

from loguru import logger
from zhipuai import ZhipuAI
|
||||
|
||||
# Configure logging: besides loguru's default sink, also write records of
# level INFO and above to a log file that is rotated once it reaches 10 MB.
logger.add("db_modify_zhipu.log", rotation="10 MB", level="INFO")

# Initialise the API client.
# The API key is a secret and must not live in version control, so it is read
# from the ZHIPUAI_API_KEY environment variable instead of being hard-coded.
# NOTE(review): the key that was previously committed here should be revoked.
_api_key = os.environ.get("ZHIPUAI_API_KEY")
if not _api_key:
    raise RuntimeError("未设置环境变量 ZHIPUAI_API_KEY,无法初始化智谱客户端")
client = ZhipuAI(api_key=_api_key)
|
||||
|
||||
def get_simplified_category(title):
    """Ask the Zhipu chat API for a short category name for *title*.

    Args:
        title: Article title to classify.

    Returns:
        The category text returned by the model, with surrounding whitespace
        and wrapping quotation marks removed, or ``None`` when the title is
        empty, the request fails, or the reply contains no usable text.
    """
    # Nothing to classify: do not spend an API call on an empty title.
    if not title or not str(title).strip():
        logger.error("获取分类失败: 标题为空")
        return None

    try:
        # Build the chat-completion request: the system message fixes the
        # assistant's role, the user message carries the title to classify.
        response = client.chat.completions.create(
            model="glm-4-flash",
            messages=[
                {
                    "role": "system",
                    "content": "你是一个专业的分类助手。请根据文章标题,提供一个3-6个汉字的简化分类名称,去除空格和特殊符号,更容易理解,并保持原意。"
                },
                {
                    "role": "user",
                    "content": f"对以下文字内容进行分类,返回结果为类别,如\"社会新闻\",\"机器人\",\"金融\",\"历史\",\"购物\",\"新质生产力\"等等。目的:只返回2-6个汉字,不返回其它内容。内容:'{title}'"
                }
            ],
            temperature=0.7
        )

        # Extract the reply text. The prompt shows its examples in quotes, so
        # the model may answer with a quoted category; strip wrapping quotes so
        # they are not stored as part of the category.
        content = response.choices[0].message.content
        category = (content or "").strip().strip("\"'“”‘’「」『』").strip()
        if not category:
            logger.error("获取分类失败: API返回内容为空")
            return None

        logger.info(f"标题: {title[:30]}... -> 分类: {category}")
        return category

    except Exception as e:
        # Best effort: any failure (network, API, unexpected response shape)
        # is logged and reported to the caller as None.
        logger.error(f"获取分类失败: {str(e)}")
        return None
|
||||
|
||||
def update_database_categories(db_path='tophub_data.db', request_delay=1):
    """Re-classify the articles stored in the SQLite database.

    Reads every row of the ``articles`` table, skips rows whose category
    already looks simplified, asks ``get_simplified_category`` for a new
    category for the remaining rows and writes each result back immediately.

    Args:
        db_path: Path of the SQLite database file. Defaults to the file the
            script has always used.
        request_delay: Seconds to wait after each API attempt, to avoid
            calling the API too frequently.

    Returns:
        None. Progress and errors are reported through the logger.
    """
    # Characters whose presence means a category is not yet simplified.
    special_chars = " ,.!?;:,。!?;:"

    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    try:
        # Fetch all records up front so the cursor can be reused for updates.
        cursor.execute("SELECT id, title, category FROM articles")
        records = cursor.fetchall()

        logger.info(f"共找到 {len(records)} 条记录需要处理")

        updated_count = 0
        failed_count = 0

        for record_id, title, current_category in records:
            # Skip categories that are already simplified
            # (at most 6 characters and no special characters).
            if current_category and len(current_category) <= 6 and not any(c in current_category for c in special_chars):
                logger.info(f"跳过记录 {record_id},分类已简化: {current_category}")
                continue

            # A row without a title cannot be classified. Count it as failed
            # and move on instead of letting one bad row abort the whole run.
            if not title:
                failed_count += 1
                logger.error(f"跳过记录 {record_id},标题为空")
                continue

            logger.info(f"处理记录 {record_id}: {title[:30]}...")

            new_category = get_simplified_category(title)

            if new_category:
                # Commit per record so finished work survives a later failure.
                cursor.execute("UPDATE articles SET category = ? WHERE id = ?", (new_category, record_id))
                conn.commit()
                updated_count += 1
                logger.info(f"已更新记录 {record_id} 的分类为: {new_category}")
            else:
                failed_count += 1
                logger.error(f"无法获取记录 {record_id} 的新分类")

            # Throttle: pause after every API attempt, successful or not.
            time.sleep(request_delay)

        logger.info(f"处理完成! 成功更新 {updated_count} 条记录,失败 {failed_count} 条记录")

    except Exception as e:
        # Any unexpected error stops the run; discard the uncommitted change.
        logger.error(f"更新数据库时出错: {str(e)}")
        conn.rollback()
    finally:
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: log the start, run the full category update once,
    # then log that the program has finished.
    logger.info("开始更新数据库分类...")
    update_database_categories()
    logger.info("程序执行完成")
|
||||
Reference in New Issue
Block a user