integrated_scraper.py 和 product_ai_analysis.py 两个文件合并
This commit is contained in:
127
product/run_system.py
Normal file
127
product/run_system.py
Normal file
@@ -0,0 +1,127 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
全功能产品系统运行脚本
|
||||
提供简化的命令行界面
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from loguru import logger
|
||||
|
||||
# 导入主系统
|
||||
from integrated_product_system import IntegratedProductSystem
|
||||
from config import DATABASE_CONFIG, CHROME_CONFIG, AI_CONFIG, SCRAPING_CONFIG, LOGGING_CONFIG, ANALYSIS_CONFIG
|
||||
|
||||
|
||||
def setup_logging(log_file=None, log_level="INFO"):
    """Configure loguru with a stderr sink and a file sink.

    Args:
        log_file: Path of the log file. When ``None``, the path is taken
            from ``LOGGING_CONFIG['log_file']``.
        log_level: Minimum level applied to both sinks.
    """
    file_target = LOGGING_CONFIG['log_file'] if log_file is None else log_file

    # Clear existing handlers first so only the two sinks added below remain.
    logger.remove()

    console_options = {"level": log_level, "format": LOGGING_CONFIG['log_format']}
    logger.add(sys.stderr, **console_options)

    file_options = {"level": log_level, "rotation": LOGGING_CONFIG['log_rotation']}
    logger.add(file_target, **file_options)

    logger.info("日志系统初始化完成")
|
||||
|
||||
|
||||
def print_system_info():
    """Log a banner followed by selected settings from the config module."""
    logger.info("=== 全功能产品抓取与分析系统 ===")

    # Each setting is looked up immediately before it is logged.
    db_path = DATABASE_CONFIG['product_db_path']
    logger.info(f"数据库路径: {db_path}")

    chrome_port = CHROME_CONFIG['debug_port']
    logger.info(f"Chrome调试端口: {chrome_port}")

    model_name = AI_CONFIG['model']
    logger.info(f"AI模型: {model_name}")

    api_endpoint = AI_CONFIG['api_url']
    logger.info(f"API地址: {api_endpoint}")

    separator = "=" * 40
    logger.info(separator)
|
||||
|
||||
|
||||
async def run_scraping_mode(args):
    """Run the scraping-only mode.

    Builds an ``IntegratedProductSystem`` from the parsed command-line
    arguments (falling back to the config module for unset options),
    initializes its database, and awaits ``run_scraping``.

    Args:
        args: Parsed argument namespace. Reads ``tophub_db``, ``product_db``,
            ``debug_port``, ``limit``, ``urls`` and the duplicate-handling
            attributes described in the body.
    """
    logger.info("运行抓取模式...")

    # The CLI flag is ``--no-skip-duplicates``, which argparse stores as
    # ``no_skip_duplicates``. The previous code only looked for a
    # ``skip_duplicates`` attribute, so the flag never took effect.
    # An explicit ``skip_duplicates`` attribute is still honored first for
    # callers that build the namespace themselves.
    if hasattr(args, 'skip_duplicates'):
        skip_duplicates = args.skip_duplicates
    elif getattr(args, 'no_skip_duplicates', False):
        skip_duplicates = False
    else:
        skip_duplicates = SCRAPING_CONFIG['skip_duplicates']

    system = IntegratedProductSystem(
        tophub_db_path=args.tophub_db or DATABASE_CONFIG['tophub_db_path'],
        product_db_path=args.product_db or DATABASE_CONFIG['product_db_path'],
        debug_port=args.debug_port or CHROME_CONFIG['debug_port'],
        limit=args.limit or SCRAPING_CONFIG['default_limit'],
        skip_duplicates=skip_duplicates,
    )

    # Initialize the database before scraping.
    system.init_database()

    # Run the scrape; ``args.urls`` is None unless ``--urls`` was given.
    await system.run_scraping(urls=args.urls)
|
||||
|
||||
|
||||
async def run_analysis_mode(args):
    """Run the analysis-only mode.

    Creates an ``IntegratedProductSystem`` pointing at the product database,
    initializes the database, and calls ``analyze_products``.

    Args:
        args: Parsed argument namespace. Reads ``product_db`` and
            ``max_products``.
    """
    logger.info("运行分析模式...")

    # Prefer the command-line path; fall back to the configured one.
    db_path = args.product_db or DATABASE_CONFIG['product_db_path']
    system = IntegratedProductSystem(product_db_path=db_path)

    # Initialize the database before analysis.
    system.init_database()

    # NOTE(review): analyze_products is called without ``await``; confirm it
    # is a synchronous method of IntegratedProductSystem.
    system.analyze_products(max_products=args.max_products)
|
||||
|
||||
|
||||
async def run_full_mode(args):
    """Run the full mode (scraping followed by analysis).

    Builds an ``IntegratedProductSystem`` from the parsed command-line
    arguments (falling back to the config module for unset options) and
    delegates to ``run_full_workflow``.

    Args:
        args: Parsed argument namespace. Reads ``tophub_db``, ``product_db``,
            ``debug_port``, ``limit``, ``max_products`` and the
            duplicate-handling attributes described in the body.
    """
    import inspect

    logger.info("运行完整模式（抓取+分析）...")

    # The CLI flag is ``--no-skip-duplicates``, which argparse stores as
    # ``no_skip_duplicates``. The previous code only looked for a
    # ``skip_duplicates`` attribute, so the flag never took effect.
    # An explicit ``skip_duplicates`` attribute is still honored first for
    # callers that build the namespace themselves.
    if hasattr(args, 'skip_duplicates'):
        skip_duplicates = args.skip_duplicates
    elif getattr(args, 'no_skip_duplicates', False):
        skip_duplicates = False
    else:
        skip_duplicates = SCRAPING_CONFIG['skip_duplicates']

    system = IntegratedProductSystem(
        tophub_db_path=args.tophub_db or DATABASE_CONFIG['tophub_db_path'],
        product_db_path=args.product_db or DATABASE_CONFIG['product_db_path'],
        debug_port=args.debug_port or CHROME_CONFIG['debug_port'],
        limit=args.limit or SCRAPING_CONFIG['default_limit'],
        skip_duplicates=skip_duplicates,
    )

    # Run the complete workflow. If run_full_workflow turns out to be a
    # coroutine function, calling it without ``await`` would drop the work,
    # so await the result whenever it is awaitable.
    # NOTE(review): confirm whether run_full_workflow is sync or async.
    outcome = system.run_full_workflow(max_products=args.max_products)
    if inspect.isawaitable(outcome):
        await outcome
|
||||
|
||||
|
||||
def main():
    """Parse command-line options, set up logging, and run the chosen mode."""
    parser = argparse.ArgumentParser(description="全功能产品抓取与分析系统")

    # Options shared by all modes.
    parser.add_argument(
        "--mode",
        choices=["scraping", "analysis", "full"],
        default="full",
        help="运行模式: scraping(仅抓取), analysis(仅分析), full(抓取+分析)",
    )
    parser.add_argument("--tophub-db", help="tophub数据库路径")
    parser.add_argument("--product-db", help="产品数据库路径")
    parser.add_argument("--debug-port", type=int, help="Chrome调试端口")
    parser.add_argument("--limit", type=int, help="抓取链接数量限制")
    parser.add_argument("--max-products", type=int, help="最大分析产品数量")
    parser.add_argument("--log-file", help="日志文件路径")
    parser.add_argument(
        "--log-level",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        default="INFO",
        help="日志级别",
    )
    parser.add_argument("--no-skip-duplicates", action="store_true", help="不跳过重复URL")
    parser.add_argument("--urls", nargs="+", help="指定要抓取的URL列表")

    args = parser.parse_args()

    # Logging must be configured before anything is logged.
    setup_logging(args.log_file, args.log_level)
    print_system_info()

    # Pick the coroutine for the requested mode; anything that is not
    # "scraping" or "analysis" runs the full workflow.
    if args.mode == "scraping":
        runner = run_scraping_mode
    elif args.mode == "analysis":
        runner = run_analysis_mode
    else:
        runner = run_full_mode

    asyncio.run(runner(args))
|
||||
|
||||
|
||||
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user