增加follow字段

This commit is contained in:
2025-12-03 19:51:12 +08:00
parent 389486ad6e
commit 9e20d439bf
6 changed files with 6818 additions and 21 deletions

View File

@@ -0,0 +1,42 @@
## 实现计划
### 1. 数据库结构更新
- **修改`init_database`方法**:在`product_analysis`表中添加`follows`字段,用于存储转换后的用户关注数
### 2. 添加用户关注数转换方法
- **创建`convert_user_count_to_number`方法**:使用Ollama API将`user_count`文本转换为数字
- 处理不同格式:"53 followers" → 53,"1.9K followers" → 1900
- 调用Ollama API进行智能转换
- 返回转换后的数字
### 3. 集成到现有分析流程
- **修改`get_product_data`方法**:在查询中包含`user_count`和`url`字段
- **更新`analyze_products`方法**:
- 扩展返回值处理,包含`user_count`和`url`
- 在分析过程中调用转换方法处理关注数
- 将转换后的数字传递给保存方法
### 4. 更新数据保存方法
- **修改`save_analysis_result`方法**:添加`follows`参数,将转换后的关注数保存到数据库
### 5. 添加关注数分析更新功能
- **创建`analyze_follower_counts`方法**:
- 查询所有产品及其分析记录
- 对每个产品转换`user_count`并更新`product_analysis.follows`
- 处理已有分析记录的关注数更新
### 6. 完善工作流程
- **更新`run_full_workflow_async`方法**:添加第4步,执行关注数分析更新
## 预期效果
- 新的`product_analysis`表将包含`follows`字段,存储转换后的数字关注数
- 新分析的产品将自动转换并保存关注数
- 已有产品将通过额外步骤更新关注数
- 使用Ollama API确保转换准确性
## 关键技术点
- SQLite数据库表结构修改
- Ollama API调用与结果解析
- 文本到数字的智能转换
- 现有代码的无缝集成
- 批量数据处理与更新

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,118 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
填补product_analysis表中follows字段内容的脚本
用于将products表中的user_count转换为数字并更新到product_analysis.follows字段
"""
import sqlite3
import os
import sys
from loguru import logger
# Configure logging: clear the handlers registered so far, then send records of
# level INFO and above to stderr using the markup layout
# "time | level | name:function:line - message".
logger.remove()
logger.add(sys.stderr, level="INFO", format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>")
class FollowsFiller:
"""用于填补follows字段内容的类"""
def __init__(self, db_path):
self.db_path = db_path
self.api_url = "http://localhost:11434/api/generate"
def connect_to_database(self) -> sqlite3.Connection:
    """Open a connection to the SQLite database at ``self.db_path``.

    Returns:
        The newly opened ``sqlite3.Connection``; the caller is responsible
        for closing it.

    Raises:
        Exception: Any error raised while connecting is logged and then
            re-raised unchanged.
    """
    try:
        connection = sqlite3.connect(self.db_path)
        logger.success(f"成功连接到数据库: {self.db_path}")
        return connection
    except Exception as exc:
        # Log for the operator, but let the caller decide how to recover.
        logger.error(f"连接数据库失败: {exc}")
        raise
def check_table_structure(self) -> bool:
    """Verify that the database has the tables and column this script needs.

    The check passes only when both the ``products`` and
    ``product_analysis`` tables exist and ``product_analysis`` has a
    ``follows`` column. The first failed check is logged and ends the
    verification.

    Returns:
        True when every check passes, False otherwise.
    """
    logger.info("正在检查数据库表结构...")
    connection = self.connect_to_database()
    db_cursor = connection.cursor()

    # (existence query, message logged when the table is missing)
    required_tables = (
        (
            "SELECT name FROM sqlite_master WHERE type='table' AND name='products'",
            "products表不存在",
        ),
        (
            "SELECT name FROM sqlite_master WHERE type='table' AND name='product_analysis'",
            "product_analysis表不存在",
        ),
    )

    try:
        for existence_query, missing_message in required_tables:
            db_cursor.execute(existence_query)
            if db_cursor.fetchone() is None:
                logger.error(missing_message)
                return False

        # PRAGMA table_info yields one row per column; index 1 is the name.
        db_cursor.execute("PRAGMA table_info(product_analysis)")
        column_names = {column_row[1] for column_row in db_cursor.fetchall()}
        if 'follows' not in column_names:
            logger.error("product_analysis表没有follows字段")
            return False

        logger.success("数据库表结构检查通过")
        return True
    finally:
        # Runs on every exit path, including the early returns above.
        connection.close()
def convert_user_count_to_number(self, user_count: str) -> "int | None":
    """Convert a follower-count label into an integer.

    The first number found in the text is used. Thousands separators
    (``1,234``) are accepted, and an optional ``K``/``M``/``B`` suffix
    (case-insensitive) scales the value by 10^3 / 10^6 / 10^9. Arithmetic is
    done with ``Decimal`` so that values such as ``"1.005K"`` are not
    truncated by binary floating-point error.

    Args:
        user_count: Raw label text, e.g. "53 followers", "1,234 followers",
            "1.9K followers" or "2.5M followers". Non-string values are
            converted with ``str()`` first.

    Returns:
        The follower count as an int (fractions are truncated), or None when
        the input is empty or contains no recognizable number.
    """
    import re
    from decimal import Decimal

    if not user_count or str(user_count).strip() == "":
        logger.info(f"空的用户数量: {user_count}")
        return None

    multipliers = {"k": 1_000, "m": 1_000_000, "b": 1_000_000_000}
    try:
        # Drop a trailing "follower"/"followers" word and surrounding spaces.
        cleaned = re.sub(
            r'\s*followers?\s*$', '', str(user_count).strip(), flags=re.IGNORECASE
        )
        # Number (with optional commas / decimals), then an optional unit
        # letter. The \b keeps the first letter of an ordinary word
        # ("53 members") from being read as a unit.
        match = re.search(
            r'(\d[\d,]*(?:\.\d+)?|\.\d+)(?:\s*([KkMmBb])\b)?', cleaned
        )
        if match is None:
            raise ValueError("no numeric value found")

        digits, unit = match.groups()
        scale = multipliers[unit.lower()] if unit else 1
        return int(Decimal(digits.replace(",", "")) * scale)
    except Exception as e:
        logger.error(f"转换用户数量失败: {user_count}, 错误: {e}")
        return None
def fill_follows_field(self):
"""填补product_analysis表中的follows字段内容"""
logger.info("=== 开始填补follows字段内容 ===")
conn = self.connect_to_database()
cursor = conn.cursor()
try:
# 查询所有产品及其对应的分析记录
cursor.execute("""
SELECT p.id, p.name, p.user_count, pa.id as analysis_id, pa.follows
FROM products p
LEFT JOIN product_analysis pa ON p.name = pa.original_name
WHERE pa.id IS NOT NULL
""")
products = cursor.fetchall()
logger.info(f"找到 {len(products)} 个产品及其分析记录")
if not products:
logger.info("没有发现需要填补follows字段的记录")
return

View File

@@ -112,10 +112,18 @@ class IntegratedProductSystem:
ai_response TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
difficulty_score INTEGER,
product_link TEXT
product_link TEXT,
follows INTEGER
)
''')
# 为现有表添加follows字段如果不存在
cursor.execute("PRAGMA table_info(product_analysis)")
columns = [col[1] for col in cursor.fetchall()]
if 'follows' not in columns:
cursor.execute("ALTER TABLE product_analysis ADD COLUMN follows INTEGER")
logger.info("已为product_analysis表添加follows字段")
conn.commit()
conn.close()
logger.success("产品数据库初始化完成")
@@ -262,9 +270,9 @@ class IntegratedProductSystem:
try:
cursor = conn.cursor()
# 查询products表中的nameintroduction字段
# 查询products表中的id、nameintroduction、user_count和url字段
cursor.execute("""
SELECT id, name, introduction
SELECT id, name, introduction, user_count, url
FROM products
WHERE name IS NOT NULL AND introduction IS NOT NULL
AND name != '' AND introduction != ''
@@ -274,8 +282,8 @@ class IntegratedProductSystem:
logger.info(f"从数据库获取到 {len(products)} 个产品")
# 显示前几个产品作为示例
for i, (id, name, intro) in enumerate(products[:3], 1):
logger.info(f"示例产品{i}: ID={id}, 名称='{name}', 简介='{intro[:50]}...'")
for i, (id, name, intro, user_count, url) in enumerate(products[:3], 1):
logger.info(f"示例产品{i}: ID={id}, 名称='{name}', 简介='{intro[:50]}...', 用户数='{user_count}', URL='{url}'")
return products
@@ -321,6 +329,64 @@ class IntegratedProductSystem:
logger.error(f"调用Ollama AI API时出错: {e}")
return None
def convert_user_count_to_number(self, user_count: str) -> Optional[int]:
    """Convert a follower-count label to an integer via the Ollama API.

    The text is sent to the model at ``self.api_url`` with a prompt asking
    for a bare number. The reply is then parsed defensively: thousands
    separators and a leftover ``K``/``M``/``B`` suffix are honoured instead
    of being cut off at the first non-digit character.

    Args:
        user_count: Raw label text, e.g. "53 followers" or "1.9K followers".

    Returns:
        The follower count as an int, or None when the input is empty, the
        HTTP call fails, or no number can be extracted from the reply.
    """
    import re
    from decimal import Decimal

    if not user_count or user_count.strip() == "":
        logger.info(f"空的用户数量: {user_count}")
        return None

    multipliers = {"k": 1_000, "m": 1_000_000, "b": 1_000_000_000}
    try:
        logger.info(f"正在转换用户数量: {user_count}")

        # Prompt dedicated to the follower-count conversion.
        prompt = f"请将以下用户数量文本转换为纯数字,不要包含任何其他内容:\n{user_count}\n\n转换规则:\n- 直接数字:如'53 followers' → 53\n- K表示千'1.9K followers' → 1900\n- M表示百万'2.5M followers' → 2500000\n- 只返回数字,不要添加任何单位或解释"
        data = {
            "model": "qwen3:8b",
            "prompt": prompt,
            "stream": False
        }
        headers = {
            "Content-Type": "application/json"
        }

        # Send explicit UTF-8 bytes: the payload contains non-ASCII text, and
        # a str body may be encoded as Latin-1 by the HTTP layer and fail.
        response = requests.post(
            self.api_url,
            headers=headers,
            data=json.dumps(data, ensure_ascii=False).encode("utf-8"),
            timeout=30
        )

        if response.status_code != 200:
            logger.error(f"Ollama API调用失败: {response.status_code}, {response.text}")
            return None

        converted = response.json().get("response", "").strip()

        # Reasoning-style models may wrap their deliberation in
        # <think>...</think>; drop it so digits quoted there are not mistaken
        # for the answer.
        answer = re.sub(
            r'<think>.*?</think>', '', converted, flags=re.DOTALL | re.IGNORECASE
        )
        number_match = re.search(
            r'(\d[\d,]*(?:\.\d+)?|\.\d+)(?:\s*([KkMmBb])\b)?', answer
        )
        if number_match is None:
            logger.error(f"无法从转换结果中提取数字: {converted}")
            return None

        digits, unit = number_match.groups()
        scale = multipliers[unit.lower()] if unit else 1
        # Decimal avoids float truncation such as int(1.005 * 1000) == 1004.
        value = int(Decimal(digits.replace(",", "")) * scale)

        # Report success only once a number has actually been extracted.
        logger.success(f"成功转换用户数量: {user_count} -> {value}")
        return value
    except Exception as e:
        logger.error(f"转换用户数量时出错: {e}")
        return None
def parse_ai_response(self, response: str) -> Tuple[str, str, str, int]:
"""解析AI响应内容提取产品名称、简介、难度描述和难度分数"""
try:
@@ -398,8 +464,8 @@ class IntegratedProductSystem:
def save_analysis_result(self, conn: sqlite3.Connection,
original_name: str, difficulty: str,
ai_response: str, difficulty_score: int = None,
product_link: str = None):
"""保存分析结果到数据库,包括难度分数产品链接"""
product_link: str = None, follows: int = None):
"""保存分析结果到数据库,包括难度分数产品链接和关注数"""
try:
cursor = conn.cursor()
@@ -409,12 +475,12 @@ class IntegratedProductSystem:
cursor.execute("""
INSERT INTO product_analysis
(original_name, development_difficulty, difficulty_score, ai_response, product_link)
VALUES (?, ?, ?, ?, ?)
""", (original_name, difficulty, difficulty_score, ai_response, product_link))
(original_name, development_difficulty, difficulty_score, ai_response, product_link, follows)
VALUES (?, ?, ?, ?, ?, ?)
""", (original_name, difficulty, difficulty_score, ai_response, product_link, follows))
conn.commit()
logger.success(f"保存分析结果成功: {original_name}, 难度分数: {difficulty_score}")
logger.success(f"保存分析结果成功: {original_name}, 难度分数: {difficulty_score}, 关注数: {follows}")
except Exception as e:
logger.error(f"保存分析结果失败: {e}")
@@ -450,7 +516,7 @@ class IntegratedProductSystem:
# 逐个分析产品
success_count = 0
skip_count = 0
for i, (original_id, name, introduction) in enumerate(products_to_analyze, 1):
for i, (original_id, name, introduction, user_count, url) in enumerate(products_to_analyze, 1):
logger.info(f"\n分析进度: {i}/{len(products_to_analyze)} - {name}")
# 检查产品是否已存在
@@ -462,7 +528,7 @@ class IntegratedProductSystem:
# 显示API调用状态
logger.info(f"正在提交API请求... 进度: {i}/{len(products_to_analyze)}")
# 调用AI API
# 调用AI API分析产品
ai_response = self.call_ollama_ai_api(name, introduction)
if ai_response:
@@ -472,8 +538,13 @@ class IntegratedProductSystem:
# 解析响应
product_intro, difficulty, difficulty_score = self.parse_ai_response(ai_response)
# 保存结果不再保存product_intro避免与ai_response重复
self.save_analysis_result(conn, name, difficulty, ai_response, difficulty_score)
# 转换用户关注数
follows = None
if user_count:
follows = self.convert_user_count_to_number(user_count)
# 保存结果
self.save_analysis_result(conn, name, difficulty, ai_response, difficulty_score, url, follows)
success_count += 1
# 显示完成状态
@@ -660,8 +731,71 @@ class IntegratedProductSystem:
conn.close()
logger.info("数据库连接已关闭")
def analyze_follower_counts(self):
    """Convert each product's ``user_count`` and store it in ``product_analysis.follows``.

    Products with a non-empty ``user_count`` are joined to their analysis
    rows by name. Products without an analysis row are skipped; for the
    rest the text is converted with ``convert_user_count_to_number`` and,
    when a number comes back, written to the matching analysis row and
    committed immediately. Errors are logged rather than raised, and the
    connection is always closed.
    """
    logger.info("=== 开始分析产品关注数 ===")

    conn = None
    try:
        conn = self.connect_to_database()
        cursor = conn.cursor()

        # Every product with a user_count, plus its analysis row id (if any).
        cursor.execute("""
            SELECT p.id, p.name, p.user_count, pa.id as analysis_id
            FROM products p
            LEFT JOIN product_analysis pa ON p.name = pa.original_name
            WHERE p.user_count IS NOT NULL AND p.user_count != ''
        """)
        rows = cursor.fetchall()
        total = len(rows)
        logger.info(f"找到 {total} 个需要更新关注数的产品")

        if total == 0:
            logger.info("没有发现需要更新关注数的产品")
            return

        updated_count = 0
        for position, row in enumerate(rows, start=1):
            _product_id, name, user_count, analysis_id = row
            logger.info(f"处理产品关注数 {position}/{total}: {name}, 用户数: {user_count}")

            # LEFT JOIN leaves analysis_id empty for unanalysed products.
            if not analysis_id:
                logger.info(f"产品 '{name}' 没有对应的分析记录,跳过")
                continue

            follows = self.convert_user_count_to_number(user_count)
            if follows is None:
                logger.warning(f"无法为产品 '{name}' 转换关注数")
            else:
                cursor.execute("""
                    UPDATE product_analysis
                    SET follows = ?
                    WHERE id = ?
                """, (follows, analysis_id))
                conn.commit()
                updated_count += 1
                logger.success(f"成功更新产品 '{name}' 的关注数为 {follows}")

            # Pause between conversions (not after the last row) so the API
            # is not called too frequently.
            if position < total:
                time.sleep(2)

        logger.success(f"关注数分析完成! 成功更新 {updated_count} 个产品的关注数")
    except Exception as exc:
        logger.error(f"分析关注数过程中出错: {exc}")
    finally:
        if conn is not None:
            conn.close()
            logger.info("数据库连接已关闭")
async def run_full_workflow_async(self, max_products=None, analyze_only=False):
"""异步运行完整工作流程:抓取+分析+补充缺失分数"""
"""异步运行完整工作流程:抓取+分析+补充缺失分数+更新关注数"""
logger.info("=== 开始全功能产品系统工作流程 ===")
# 初始化数据库
@@ -682,6 +816,10 @@ class IntegratedProductSystem:
logger.info("步骤3: 开始分析并补充缺失的难度分数...")
self.analyze_missing_scores()
# 步骤4: 分析并更新产品关注数
logger.info("步骤4: 开始分析并更新产品关注数...")
self.analyze_follower_counts()
logger.success("=== 全功能产品系统工作流程完成 ===")
def run_full_workflow(self, max_products=None, analyze_only=False):
@@ -723,11 +861,11 @@ async def main():
chrome_bat_path = os.path.join(os.path.dirname(__file__), "start_chrome.bat")
logger.info(f"正在运行Chrome启动脚本: {chrome_bat_path}")
try:
# 运行批处理程序,等待其完成
subprocess.run([chrome_bat_path], check=True, shell=True)
logger.success("Chrome启动脚本执行成功")
except subprocess.CalledProcessError as e:
logger.error(f"Chrome启动脚本执行失败: {e}")
# 异步运行批处理程序,等待其完成
subprocess.Popen([chrome_bat_path], shell=True)
logger.success("Chrome启动脚本已启动")
except Exception as e:
logger.error(f"Chrome启动脚本启动失败: {e}")
except FileNotFoundError:
logger.error(f"未找到Chrome启动脚本: {chrome_bat_path}")

Binary file not shown.

View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
数据库更新脚本
用于更新现有的product_analysis表添加follows字段
"""
import sqlite3
import os
from loguru import logger
import sys
# Configure logging: clear the handlers registered so far, then send records of
# level INFO and above to stderr using the markup layout
# "time | level | name:function:line - message".
logger.remove()
logger.add(sys.stderr, level="INFO", format="<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>")
def update_product_analysis_table(db_path):
    """Add the ``follows`` column to ``product_analysis`` if it is missing.

    Args:
        db_path: Path of the SQLite database to migrate.

    Returns:
        True when the column already exists or was added successfully;
        False when the table does not exist or any error occurs (the error
        is logged, not raised).
    """
    logger.info(f"开始更新数据库: {db_path}")

    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # The migration only makes sense when the target table exists.
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='product_analysis'")
        if cursor.fetchone() is None:
            logger.error("product_analysis表不存在无法更新")
            return False

        # PRAGMA table_info yields one row per column; index 1 is the name.
        cursor.execute("PRAGMA table_info(product_analysis)")
        existing_columns = [info[1] for info in cursor.fetchall()]
        if 'follows' in existing_columns:
            logger.info("follows字段已经存在无需更新")
            return True

        cursor.execute("ALTER TABLE product_analysis ADD COLUMN follows INTEGER")
        conn.commit()
        logger.success("成功为product_analysis表添加follows字段")
        return True
    except Exception as exc:
        logger.error(f"更新数据库失败: {exc}")
        return False
    finally:
        # Single cleanup point for every exit path above.
        if conn is not None:
            conn.close()
def main():
    """Run the migration against ``products.db`` next to this script.

    Exits the process with status 0 when the update succeeds and 1 when it
    fails.
    """
    # The default database lives in the same directory as this file.
    script_dir = os.path.dirname(__file__)
    default_db_path = os.path.join(script_dir, "products.db")

    logger.info("=== 数据库更新脚本开始执行 ===")

    if not update_product_analysis_table(default_db_path):
        logger.error("=== 数据库更新失败 ===")
        sys.exit(1)

    logger.success("=== 数据库更新成功 ===")
    sys.exit(0)
if __name__ == "__main__":
main()