增加抓取producthunt的数据
This commit is contained in:
BIN
product/__pycache__/playwright-get-data.cpython-313.pyc
Normal file
BIN
product/__pycache__/playwright-get-data.cpython-313.pyc
Normal file
Binary file not shown.
312
product/advanced_scraper.py
Normal file
312
product/advanced_scraper.py
Normal file
@@ -0,0 +1,312 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
高级ProductHunt抓取器 - 处理Cloudflare Turnstile挑战
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sqlite3
|
||||
from loguru import logger
|
||||
import os
|
||||
from urllib.parse import urlparse
|
||||
|
||||
class AdvancedProductHuntScraper:
    """Scrape ProductHunt product pages with Playwright and store them in SQLite.

    Chromium is launched in non-headless mode so that a person can complete a
    Cloudflare challenge by hand when the page shows one.
    """

    # Title fragments that mark a page as a Cloudflare interstitial.
    _CHALLENGE_TITLE_MARKERS = ("请稍候", "Checking", "Verifying")

    # Selectors tried, in order, when looking for the product name.
    _NAME_SELECTORS = (
        "h1",
        "[data-test='product-name']",
        ".product-name",
        "title",
    )

    def __init__(self, db_path="test_product.db"):
        """Store the database path and make sure the ``products`` table exists.

        Args:
            db_path: Path of the SQLite file that receives scraped products.
        """
        self.db_path = db_path
        self.init_database()

    def init_database(self):
        """Create the ``products`` table if it does not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS products (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT,
                    url TEXT UNIQUE,
                    introduction TEXT,
                    user_count INTEGER,
                    maker_link TEXT,
                    maker_statement TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            conn.commit()
        finally:
            # Close even when the statement fails so the handle is not leaked.
            conn.close()
        logger.info(f"数据库已初始化: {self.db_path}")

    def check_duplicate(self, url):
        """Return True when a row with this ``url`` is already stored.

        Args:
            url: Product URL to look up in the ``products`` table.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            row = conn.execute(
                "SELECT id FROM products WHERE url = ?", (url,)
            ).fetchone()
        finally:
            conn.close()
        return row is not None

    def save_product_info(self, product_info):
        """Insert a product, or update the existing row that has the same URL.

        Args:
            product_info: Mapping with the keys ``url``, ``name``,
                ``introduction``, ``user_count``, ``maker_link`` and
                ``maker_statement``.

        Raises:
            KeyError: If one of the required keys is missing.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT id FROM products WHERE url = ?", (product_info['url'],)
            )
            existing = cursor.fetchone()

            if existing:
                # Refresh the stored row and bump its update timestamp.
                cursor.execute("""
                    UPDATE products SET
                        name = ?, introduction = ?, user_count = ?,
                        maker_link = ?, maker_statement = ?, updated_at = CURRENT_TIMESTAMP
                    WHERE url = ?
                """, (
                    product_info['name'], product_info['introduction'],
                    product_info['user_count'], product_info['maker_link'],
                    product_info['maker_statement'], product_info['url'],
                ))
                logger.info(f"更新产品信息: {product_info['name']}")
            else:
                cursor.execute("""
                    INSERT INTO products (name, url, introduction, user_count, maker_link, maker_statement)
                    VALUES (?, ?, ?, ?, ?, ?)
                """, (
                    product_info['name'], product_info['url'], product_info['introduction'],
                    product_info['user_count'], product_info['maker_link'],
                    product_info['maker_statement'],
                ))
                logger.info(f"保存产品信息: {product_info['name']}")

            conn.commit()
        finally:
            conn.close()

    @staticmethod
    def _name_from_url(url):
        """Derive a display name from a ``.../products/<slug>`` URL.

        Empty path segments are ignored, so a trailing slash does not hide the
        slug. Returns None when the URL does not end in ``products/<slug>``.
        """
        segments = [part for part in urlparse(url).path.split('/') if part]
        if len(segments) >= 2 and segments[-2] == 'products':
            return segments[-1].replace('-', ' ').title()
        return None

    async def _wait_for_challenge(self, page):
        """Wait up to five minutes for the challenge title to disappear.

        If the wait fails (including on timeout) the page is reloaded once.
        """
        logger.info("检测到Cloudflare挑战页面，等待用户手动验证...")
        try:
            await page.wait_for_function(
                """() => {
                    const title = document.title;
                    return !title.includes('请稍候') &&
                           !title.includes('Checking') &&
                           !title.includes('Verifying') &&
                           title !== '请稍候…';
                }""",
                timeout=300000,  # 5 minutes
            )
            logger.info("Cloudflare挑战已完成")
        except Exception as e:
            logger.warning(f"等待Cloudflare挑战超时: {e}")
            await page.reload(wait_until="domcontentloaded")
            logger.info("已刷新页面")

    async def _extract_name(self, page, url):
        """Return the product name from the page, falling back to the URL slug."""
        for selector in self._NAME_SELECTORS:
            try:
                element = await page.query_selector(selector)
                if element:
                    name = await element.text_content()
                    # Skip empty text and the bare host name.
                    if name and name.strip() and name.strip() != "www.producthunt.com":
                        name = name.strip()
                        logger.info(f"提取到产品名称: {name}")
                        return name
            except Exception as e:
                logger.debug(f"选择器 {selector} 失败: {e}")

        name = self._name_from_url(url)
        if name is not None:
            logger.info(f"从URL提取产品名称: {name}")
            return name

        logger.warning("无法提取产品名称")
        return "Unknown Product"

    async def scrape_with_stealth(self, url):
        """Open ``url`` in a visible Chromium and extract the product name.

        Args:
            url: ProductHunt page to visit.

        Returns:
            A dict with the keys ``url``, ``name``, ``introduction``,
            ``user_count``, ``maker_link`` and ``maker_statement``. Only
            ``url`` and ``name`` are filled in; the rest are None. On any
            failure the same shape is returned with ``name`` set to
            ``'Error'``.
        """
        try:
            from playwright.async_api import async_playwright

            logger.info(f"开始高级抓取: {url}")

            # The context manager stops the Playwright driver on every exit path.
            async with async_playwright() as playwright:
                browser = await playwright.chromium.launch(
                    headless=False,  # visible window so the challenge can be solved by hand
                    args=[
                        '--disable-blink-features=AutomationControlled',
                        '--disable-features=VizDisplayCompositor',
                        '--disable-background-timer-throttling',
                        '--disable-backgrounding-occluded-windows',
                        '--disable-renderer-backgrounding',
                        '--disable-web-security',
                        '--disable-features=TranslateUI',
                        '--disable-ipc-flooding-protection',
                        '--no-sandbox',
                        '--disable-setuid-sandbox',
                    ],
                )
                try:
                    context = await browser.new_context(
                        viewport={'width': 1920, 'height': 1080},
                        user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                        extra_http_headers={
                            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                            'Accept-Language': 'en-US,en;q=0.9',
                            'Accept-Encoding': 'gzip, deflate, br',
                            'DNT': '1',
                            'Connection': 'keep-alive',
                            'Upgrade-Insecure-Requests': '1',
                        },
                    )
                    page = await context.new_page()

                    # Override navigator properties that reveal automation.
                    await page.add_init_script("""
                        Object.defineProperty(navigator, 'webdriver', {
                            get: () => undefined,
                        });
                        Object.defineProperty(navigator, 'plugins', {
                            get: () => [1, 2, 3, 4, 5],
                        });
                        Object.defineProperty(navigator, 'languages', {
                            get: () => ['en-US', 'en'],
                        });
                    """)

                    page.set_default_timeout(300000)  # 5 minutes
                    await page.goto(url, wait_until="domcontentloaded")

                    page_title = await page.title()
                    logger.info(f"页面标题: {page_title}")

                    if any(marker in page_title for marker in self._CHALLENGE_TITLE_MARKERS):
                        await self._wait_for_challenge(page)

                    # Give the page a moment to finish rendering.
                    await page.wait_for_timeout(5000)

                    current_url = page.url
                    logger.info(f"当前页面URL: {current_url}")
                    if current_url != url:
                        logger.warning(f"页面已重定向: {url} -> {current_url}")

                    product_info = {
                        'url': url,
                        'name': await self._extract_name(page, url),
                        # Simplified version: the remaining fields are not scraped.
                        'introduction': None,
                        'user_count': None,
                        'maker_link': None,
                        'maker_statement': None,
                    }
                finally:
                    # Close the browser even when navigation or extraction fails.
                    await browser.close()

            logger.success(f"抓取完成: {product_info['name']}")
            return product_info

        except Exception as e:
            logger.error(f"抓取失败: {e}")
            return {'url': url, 'name': 'Error', 'introduction': None, 'user_count': None, 'maker_link': None, 'maker_statement': None}

    async def run_test(self):
        """Scrape up to three ProductHunt URLs from ``tophub_data.db`` and print a summary.

        The source database is looked up at ``../tophub_data.db`` relative to
        the directory of ``self.db_path``.
        """
        tophub_db_path = os.path.join(os.path.dirname(self.db_path), "..", "tophub_data.db")

        conn = sqlite3.connect(tophub_db_path)
        try:
            cursor = conn.execute("""
                SELECT url FROM articles
                WHERE url LIKE '%producthunt.com%'
                LIMIT 3
            """)
            urls = [row[0] for row in cursor.fetchall()]
        finally:
            conn.close()

        logger.info(f"找到 {len(urls)} 个ProductHunt链接")

        for url in urls:
            logger.info(f"处理URL: {url}")
            # The duplicate check is deliberately not applied here, so every
            # URL is scraped again and its row refreshed.
            product_info = await self.scrape_with_stealth(url)
            self.save_product_info(product_info)

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM products")
            count = cursor.fetchone()[0]
            cursor.execute("SELECT name, url FROM products")
            products = cursor.fetchall()
        finally:
            conn.close()

        logger.success("测试任务完成")

        print("\n=== 测试结果统计 ===")
        print(f"数据库中的产品数量: {count}")
        print("已抓取的产品:")
        for name, url in products:
            print(f" - {name}: {url}")
|
||||
|
||||
async def main():
    """Send logs to ``advanced_scraper.log`` and run the scraper's test routine."""
    # Remove the sinks registered so far before adding the file sink.
    logger.remove()
    sink_options = {
        "level": "DEBUG",
        "format": "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {name}:{function}:{line} - {message}",
        "rotation": "10 MB",
        "retention": "7 days",
    }
    logger.add("advanced_scraper.log", **sink_options)

    # Build the scraper and run its test pass.
    await AdvancedProductHuntScraper().run_test()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
245
product/api_scraper.py
Normal file
245
product/api_scraper.py
Normal file
@@ -0,0 +1,245 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
ProductHunt API抓取器 - 通过API获取产品信息
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sqlite3
|
||||
import requests
|
||||
from loguru import logger
|
||||
import os
|
||||
import json
|
||||
from urllib.parse import urlparse
|
||||
|
||||
class ProductHuntAPIScraper:
    """Build basic ProductHunt product records from URLs and store them in SQLite.

    No network request is made: the product name and introduction are derived
    from the URL itself.
    """

    def __init__(self, db_path="test_product.db"):
        """Store the database path and make sure the ``products`` table exists.

        Args:
            db_path: Path of the SQLite file that receives the records.
        """
        self.db_path = db_path
        self.init_database()

    def init_database(self):
        """Create the ``products`` table if it does not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS products (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT,
                    url TEXT UNIQUE,
                    introduction TEXT,
                    user_count INTEGER,
                    maker_link TEXT,
                    maker_statement TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            conn.commit()
        finally:
            # Close even when the statement fails so the handle is not leaked.
            conn.close()
        logger.info(f"数据库已初始化: {self.db_path}")

    def save_product_info(self, product_info):
        """Insert a product, or update the existing row that has the same URL.

        Args:
            product_info: Mapping with the keys ``url``, ``name``,
                ``introduction``, ``user_count``, ``maker_link`` and
                ``maker_statement``.

        Raises:
            KeyError: If one of the required keys is missing.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT id FROM products WHERE url = ?", (product_info['url'],)
            )
            existing = cursor.fetchone()

            if existing:
                # Refresh the stored row and bump its update timestamp.
                cursor.execute("""
                    UPDATE products SET
                        name = ?, introduction = ?, user_count = ?,
                        maker_link = ?, maker_statement = ?, updated_at = CURRENT_TIMESTAMP
                    WHERE url = ?
                """, (
                    product_info['name'], product_info['introduction'],
                    product_info['user_count'], product_info['maker_link'],
                    product_info['maker_statement'], product_info['url'],
                ))
                logger.info(f"更新产品信息: {product_info['name']}")
            else:
                cursor.execute("""
                    INSERT INTO products (name, url, introduction, user_count, maker_link, maker_statement)
                    VALUES (?, ?, ?, ?, ?, ?)
                """, (
                    product_info['name'], product_info['url'], product_info['introduction'],
                    product_info['user_count'], product_info['maker_link'],
                    product_info['maker_statement'],
                ))
                logger.info(f"保存产品信息: {product_info['name']}")

            conn.commit()
        finally:
            conn.close()

    @staticmethod
    def _extract_product_slug(url):
        """Return the non-empty path segment that follows ``products``, or None."""
        path_parts = urlparse(url).path.split('/')
        for i, part in enumerate(path_parts):
            # An empty segment (e.g. from ".../products/") is not a usable slug.
            if part == 'products' and i + 1 < len(path_parts) and path_parts[i + 1]:
                return path_parts[i + 1]
        return None

    def extract_product_name_from_url(self, url):
        """Turn the product slug in ``url`` into a title-cased display name.

        The segment after ``products`` is preferred. Otherwise the last path
        segment is used. ``"Unknown Product"`` is returned when neither yields
        a non-empty value, or when parsing raises.
        """
        try:
            slug = self._extract_product_slug(url)
            if slug:
                return slug.replace('-', ' ').title()

            # No products segment: fall back to the last path segment.
            last_part = urlparse(url).path.split('/')[-1]
            if last_part:
                return last_part.replace('-', ' ').title()

            return "Unknown Product"
        except Exception as e:
            logger.error(f"从URL提取产品名称失败: {e}")
            return "Unknown Product"

    def get_product_info_from_api(self, url):
        """Build a product record for ``url`` from its slug.

        Despite the name, no API call is made here; the original comments note
        that the ProductHunt GraphQL API needs a key, so only URL-derived
        fields are filled in.

        Returns:
            The record dict, or None when the URL has no product slug or an
            error occurs.
        """
        try:
            product_slug = self._extract_product_slug(url)
            if not product_slug:
                logger.warning(f"无法从URL中提取产品slug: {url}")
                return None

            product_info = {
                'url': url,
                'name': self.extract_product_name_from_url(url),
                'introduction': f"Product from ProductHunt: {product_slug}",
                'user_count': None,  # would need API access
                'maker_link': None,  # would need API access
                'maker_statement': None,  # would need API access
            }

            logger.info(f"通过API获取产品信息: {product_info['name']}")
            return product_info

        except Exception as e:
            logger.error(f"API获取产品信息失败: {e}")
            return None

    def get_product_info_fallback(self, url):
        """Build a product record from the URL-derived name only.

        Returns:
            The record dict, or None when an error occurs.
        """
        try:
            product_name = self.extract_product_name_from_url(url)

            product_info = {
                'url': url,
                'name': product_name,
                'introduction': f"Product from ProductHunt: {product_name}",
                'user_count': None,
                'maker_link': None,
                'maker_statement': None,
            }

            logger.info(f"使用备用方法获取产品信息: {product_info['name']}")
            return product_info

        except Exception as e:
            logger.error(f"备用方法获取产品信息失败: {e}")
            return None

    def run_test(self):
        """Process up to three ProductHunt URLs from ``tophub_data.db`` and print a summary.

        The source database is looked up at ``../tophub_data.db`` relative to
        the directory of ``self.db_path``.
        """
        tophub_db_path = os.path.join(os.path.dirname(self.db_path), "..", "tophub_data.db")

        conn = sqlite3.connect(tophub_db_path)
        try:
            cursor = conn.execute("""
                SELECT url FROM articles
                WHERE url LIKE '%producthunt.com%'
                LIMIT 3
            """)
            urls = [row[0] for row in cursor.fetchall()]
        finally:
            conn.close()

        logger.info(f"找到 {len(urls)} 个ProductHunt链接")

        for url in urls:
            logger.info(f"处理URL: {url}")

            # Try the slug-based record first, then the name-only fallback.
            product_info = self.get_product_info_from_api(url)
            if not product_info:
                product_info = self.get_product_info_fallback(url)

            # Last resort: store a placeholder so the URL is still recorded.
            if not product_info:
                product_info = {
                    'url': url,
                    'name': 'Unknown Product',
                    'introduction': 'Unable to fetch product information',
                    'user_count': None,
                    'maker_link': None,
                    'maker_statement': None,
                }

            self.save_product_info(product_info)

        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM products")
            count = cursor.fetchone()[0]
            cursor.execute("SELECT name, url FROM products")
            products = cursor.fetchall()
        finally:
            conn.close()

        logger.success("测试任务完成")

        print("\n=== 测试结果统计 ===")
        print(f"数据库中的产品数量: {count}")
        print("已抓取的产品:")
        for name, url in products:
            print(f" - {name}: {url}")
|
||||
|
||||
def main():
    """Send logs to ``api_scraper.log`` and run the scraper's test routine."""
    # Remove the sinks registered so far before adding the file sink.
    logger.remove()
    sink_options = {
        "level": "DEBUG",
        "format": "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {name}:{function}:{line} - {message}",
        "rotation": "10 MB",
        "retention": "7 days",
    }
    logger.add("api_scraper.log", **sink_options)

    # Build the scraper and run its test pass.
    ProductHuntAPIScraper().run_test()


if __name__ == "__main__":
    main()
|
||||
356
product/integrated_scraper.py
Normal file
356
product/integrated_scraper.py
Normal file
@@ -0,0 +1,356 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
全功能ProductHunt数据抓取器
|
||||
使用playwright-get-data.py中的专业功能绕过Cloudflare挑战
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import asyncio
|
||||
import os
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
from tqdm import tqdm
|
||||
import sys
|
||||
|
||||
# 导入playwright-get-data.py中的功能
|
||||
import importlib.util
|
||||
|
||||
# 动态导入playwright-get-data.py
|
||||
# "playwright-get-data.py" has hyphens in its name, so it cannot be imported
# with a normal ``import`` statement; load it from its path next to this file.
playwright_data_path = os.path.join(
    os.path.dirname(__file__),
    "playwright-get-data.py",
)
spec = importlib.util.spec_from_file_location(
    "playwright_get_data",
    playwright_data_path,
)
playwright_get_data = importlib.util.module_from_spec(spec)
spec.loader.exec_module(playwright_get_data)
ProductHuntScraper = playwright_get_data.ProductHuntScraper

# Logging: replace the existing sinks with a colourised INFO-level stderr sink.
logger.remove()
logger.add(
    sys.stderr,
    level="INFO",
    format=(
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level: <8}</level> | "
        "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
        "<level>{message}</level>"
    ),
)
|
||||
|
||||
class ProductHuntScraperFull:
    """Scrape ProductHunt URLs through ``ProductHuntScraper`` and store the results.

    URLs come from the ``articles`` table of the tophub database (or from the
    caller) and results are written to the ``products`` table of the product
    database.
    """

    def __init__(self, tophub_db_path=None, product_db_path=None, debug_port=9222, limit=10, skip_duplicates=True):
        """Set up paths and scraping options.

        Args:
            tophub_db_path: Path of the tophub database. Defaults to
                ``tophub_data.db`` in the parent of this file's directory.
            product_db_path: Path of the product database. Defaults to
                ``products.db`` next to this file.
            debug_port: Chrome debugging port passed to ``ProductHuntScraper``.
            limit: Maximum number of URLs to query; values <= 0 mean no limit.
            skip_duplicates: Whether URLs already stored are skipped.
        """
        module_dir = os.path.dirname(__file__)

        if tophub_db_path:
            self.tophub_db_path = tophub_db_path
        else:
            self.tophub_db_path = os.path.join(os.path.dirname(module_dir), "tophub_data.db")

        if product_db_path:
            self.product_db_path = product_db_path
        else:
            self.product_db_path = os.path.join(module_dir, "products.db")

        self.debug_port = debug_port
        self.limit = limit
        self.skip_duplicates = skip_duplicates
        self.product_urls = []

    def query_producthunt_urls(self, limit=None):
        """Return article URLs that contain ``producthunt.com``.

        Args:
            limit: Maximum number of rows; falls back to ``self.limit`` when
                None. Values <= 0 disable the limit.

        Returns:
            A list of URL strings, or an empty list when the query fails.
        """
        if limit is None:
            limit = self.limit

        logger.info(f"正在查询tophub_data.db数据库，限制: {limit}条")

        try:
            conn = sqlite3.connect(self.tophub_db_path)
            try:
                cursor = conn.cursor()
                if limit > 0:
                    cursor.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%' LIMIT ?", (limit,))
                else:
                    cursor.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%'")
                urls = [row[0] for row in cursor.fetchall()]
            finally:
                # Close even when the query fails so the handle is not leaked.
                conn.close()

            logger.success(f"找到 {len(urls)} 个包含producthunt.com的链接")
            return urls

        except Exception as e:
            logger.error(f"查询数据库失败: {e}")
            return []

    def init_product_database(self):
        """Create the ``products`` table if needed; errors are logged, not raised."""
        logger.info("正在初始化产品数据库...")

        try:
            conn = sqlite3.connect(self.product_db_path)
            try:
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS products (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        url TEXT NOT NULL UNIQUE,
                        name TEXT,
                        introduction TEXT,
                        user_count TEXT,
                        maker_link TEXT,
                        maker_statement TEXT,
                        created_at TEXT NOT NULL,
                        updated_at TEXT NOT NULL
                    )
                ''')
                conn.commit()
            finally:
                conn.close()
            logger.success("产品数据库初始化完成")

        except Exception as e:
            logger.error(f"初始化数据库失败: {e}")

    def check_duplicate(self, url):
        """Return True when ``url`` is already stored; False otherwise or on error."""
        try:
            conn = sqlite3.connect(self.product_db_path)
            try:
                count = conn.execute(
                    "SELECT COUNT(*) FROM products WHERE url = ?", (url,)
                ).fetchone()[0]
            finally:
                conn.close()
            return count > 0

        except Exception as e:
            logger.error(f"检查重复失败: {e}")
            return False

    def save_product_info(self, product_info):
        """Insert a product, or update the existing row that has the same URL.

        Args:
            product_info: Mapping that must contain ``url``; ``name``,
                ``introduction``, ``user_count``, ``maker_link`` and
                ``maker_statement`` are optional and stored as NULL if absent.

        Returns:
            True on success, False when anything fails (the error is logged).
        """
        try:
            conn = sqlite3.connect(self.product_db_path)
            try:
                cursor = conn.cursor()
                current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

                cursor.execute("SELECT id FROM products WHERE url = ?", (product_info['url'],))
                existing = cursor.fetchone()

                if existing:
                    # Refresh the stored row and its update timestamp.
                    cursor.execute('''
                        UPDATE products SET
                            name = ?, introduction = ?, user_count = ?,
                            maker_link = ?, maker_statement = ?, updated_at = ?
                        WHERE url = ?
                    ''', (
                        product_info.get('name'),
                        product_info.get('introduction'),
                        product_info.get('user_count'),
                        product_info.get('maker_link'),
                        product_info.get('maker_statement'),
                        current_time,
                        product_info['url'],
                    ))
                    logger.info(f"更新产品信息: {product_info.get('name', '未知')}")
                else:
                    cursor.execute('''
                        INSERT INTO products
                        (url, name, introduction, user_count, maker_link, maker_statement, created_at, updated_at)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    ''', (
                        product_info['url'],
                        product_info.get('name'),
                        product_info.get('introduction'),
                        product_info.get('user_count'),
                        product_info.get('maker_link'),
                        product_info.get('maker_statement'),
                        current_time,
                        current_time,
                    ))
                    logger.info(f"新增产品信息: {product_info.get('name', '未知')}")

                conn.commit()
            finally:
                conn.close()
            return True

        except Exception as e:
            logger.error(f"保存产品信息失败: {e}")
            return False

    async def scrape_product_info(self, url):
        """Scrape one URL through ``ProductHuntScraper`` attached to a running Chrome.

        Returns:
            The extracted product dict with ``url`` added, or None when the
            connection, navigation or extraction fails.
        """
        try:
            logger.info(f"开始抓取: {url}")

            scraper = ProductHuntScraper(debug_port=self.debug_port)

            connected = await scraper.connect_to_existing_chrome()
            if not connected:
                logger.error("连接Chrome失败，跳过此URL")
                return None

            try:
                navigated = await scraper.navigate_to_producthunt(url)
                if not navigated:
                    logger.error("导航到页面失败，跳过此URL")
                    return None

                product_info = await scraper.extract_product_info()
                if product_info:
                    product_info['url'] = url
                    logger.success(f"成功提取产品信息: {product_info.get('name', '未知')}")
                else:
                    logger.error("提取产品信息失败")

                return product_info
            finally:
                # Release the connection on every path once it was established,
                # including when extraction raises.
                await scraper.close()

        except Exception as e:
            logger.error(f"抓取产品信息失败: {e}")
            return None

    async def run_scraping(self, urls=None):
        """Scrape every URL, store the results and log a summary.

        Args:
            urls: URLs to scrape; when None they are queried from the tophub
                database.

        Returns:
            False when there is nothing to scrape, True otherwise.
        """
        logger.info("=== 开始ProductHunt数据抓取 ===")

        self.init_product_database()

        if urls is None:
            self.product_urls = self.query_producthunt_urls()
        else:
            self.product_urls = urls

        if not self.product_urls:
            logger.error("未找到要抓取的ProductHunt链接")
            return False

        logger.info(f"找到 {len(self.product_urls)} 个ProductHunt链接")

        success_count = 0
        skip_count = 0
        error_count = 0

        with tqdm(total=len(self.product_urls), desc="抓取ProductHunt链接") as pbar:
            for url in self.product_urls:
                logger.info(f"处理URL: {url}")

                if self.skip_duplicates and self.check_duplicate(url):
                    logger.info(f"URL已存在，跳过: {url}")
                    skip_count += 1
                    pbar.update(1)
                    continue

                product_info = await self.scrape_product_info(url)

                if product_info:
                    if self.save_product_info(product_info):
                        logger.success(f"成功保存产品信息: {product_info.get('name', '未知')}")
                        success_count += 1
                    else:
                        logger.error(f"保存产品信息失败: {url}")
                        error_count += 1
                else:
                    logger.error(f"抓取产品信息失败: {url}")
                    error_count += 1

                pbar.update(1)

        self.show_scraping_results(success_count, skip_count, error_count)

        logger.success("=== ProductHunt数据抓取完成 ===")
        return True

    def show_scraping_results(self, success_count, skip_count, error_count):
        """Log the run counters, the table size and the ten most recently updated rows."""
        try:
            conn = sqlite3.connect(self.product_db_path)
            try:
                cursor = conn.cursor()
                cursor.execute("SELECT COUNT(*) FROM products")
                total_count = cursor.fetchone()[0]

                cursor.execute("SELECT name, url FROM products ORDER BY updated_at DESC LIMIT 10")
                recent_products = cursor.fetchall()
            finally:
                conn.close()

            logger.info("=== 抓取结果统计 ===")
            logger.info(f"成功抓取: {success_count} 个产品")
            logger.info(f"跳过重复: {skip_count} 个链接")
            logger.info(f"抓取失败: {error_count} 个链接")
            logger.info(f"数据库中的产品总数: {total_count}")

            if recent_products:
                logger.info("最新抓取的产品:")
                for name, url in recent_products:
                    logger.info(f" - {name}: {url}")
            else:
                logger.info("数据库中暂无产品记录")

        except Exception as e:
            logger.error(f"显示抓取结果失败: {e}")
|
||||
|
||||
def parse_arguments(argv=None):
    """Parse the scraper's command-line options.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads ``sys.argv[1:]``, which matches the previous
            behaviour.

    Returns:
        The ``argparse.Namespace`` with ``tophub_db``, ``product_db``,
        ``debug_port``, ``limit``, ``no_skip_duplicates``, ``urls`` and
        ``log_file``.
    """
    parser = argparse.ArgumentParser(description="全功能ProductHunt数据抓取器")
    parser.add_argument("--tophub-db", help="tophub数据库路径", default=None)
    parser.add_argument("--product-db", help="产品数据库路径", default=None)
    parser.add_argument("--debug-port", type=int, help="Chrome调试端口", default=9222)
    parser.add_argument("--limit", type=int, help="抓取链接数量限制", default=10)
    parser.add_argument("--no-skip-duplicates", action="store_true", help="不跳过重复URL")
    parser.add_argument("--urls", nargs="+", help="指定要抓取的URL列表")
    parser.add_argument("--log-file", help="日志文件路径", default="producthunt_scraper.log")

    return parser.parse_args(argv)
|
||||
|
||||
async def main():
    """Parse the command line, add the file log sink and run the scraping job."""
    args = parse_arguments()

    # Also write INFO-level logs to the requested file.
    logger.add(args.log_file, level="INFO", rotation="10 MB")

    scraper_options = {
        "tophub_db_path": args.tophub_db,
        "product_db_path": args.product_db,
        "debug_port": args.debug_port,
        "limit": args.limit,
        "skip_duplicates": not args.no_skip_duplicates,
    }
    scraper = ProductHuntScraperFull(**scraper_options)

    # Explicit URLs win; with none given the scraper queries them itself.
    await scraper.run_scraping(urls=args.urls or None)


if __name__ == "__main__":
    # Run the async entry point.
    asyncio.run(main())
|
||||
BIN
product/product.db
Normal file
BIN
product/product.db
Normal file
Binary file not shown.
407
product/producthunt_scraper.py
Normal file
407
product/producthunt_scraper.py
Normal file
@@ -0,0 +1,407 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
ProductHunt数据抓取器
|
||||
从tophub_data.db查询包含producthunt.com的链接,然后使用Playwright抓取产品信息并保存到product.db
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import asyncio
|
||||
import os
|
||||
from datetime import datetime
|
||||
from loguru import logger
|
||||
from tqdm import tqdm
|
||||
import sys
|
||||
|
||||
# 配置日志
|
||||
# Logging: replace the existing sinks with a colourised INFO-level stderr sink.
logger.remove()
logger.add(
    sys.stderr,
    level="INFO",
    format=(
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level: <8}</level> | "
        "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
        "<level>{message}</level>"
    ),
)
|
||||
|
||||
class ProductHuntScraper:
|
||||
"""ProductHunt数据抓取器"""
|
||||
|
||||
def __init__(self):
|
||||
self.tophub_db_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "tophub_data.db")
|
||||
self.product_db_path = os.path.join(os.path.dirname(__file__), "product.db")
|
||||
self.product_urls = []
|
||||
|
||||
def query_producthunt_urls(self):
    """Return every article URL that contains ``producthunt.com``.

    Returns:
        A list of URL strings read from the ``articles`` table of
        ``self.tophub_db_path``, or an empty list when the query fails.
    """
    logger.info("正在查询tophub_data.db数据库...")

    try:
        conn = sqlite3.connect(self.tophub_db_path)
        try:
            cursor = conn.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%'")
            urls = [row[0] for row in cursor.fetchall()]
        finally:
            # Close even when the query fails so the handle is not leaked.
            conn.close()

        logger.success(f"找到 {len(urls)} 个包含producthunt.com的链接")
        return urls

    except Exception as e:
        logger.error(f"查询数据库失败: {e}")
        return []
|
||||
|
||||
def init_product_database(self):
    """Create the ``products`` table in ``self.product_db_path`` if needed.

    Errors are logged rather than raised.
    """
    logger.info("正在初始化product.db数据库...")

    try:
        conn = sqlite3.connect(self.product_db_path)
        try:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS products (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    url TEXT NOT NULL UNIQUE,
                    name TEXT,
                    introduction TEXT,
                    user_count TEXT,
                    maker_link TEXT,
                    maker_statement TEXT,
                    created_at TEXT NOT NULL,
                    updated_at TEXT NOT NULL
                )
            ''')
            conn.commit()
        finally:
            # Close even when the statement fails so the handle is not leaked.
            conn.close()
        logger.success("product.db数据库初始化完成")

    except Exception as e:
        logger.error(f"初始化数据库失败: {e}")
|
||||
|
||||
def check_duplicate(self, url):
    """Return True when ``url`` already has a row in the ``products`` table.

    Args:
        url: Product URL to look up.

    Returns:
        True if at least one row matches; False otherwise, and also False
        when the lookup fails (the error is logged).
    """
    try:
        conn = sqlite3.connect(self.product_db_path)
        try:
            count = conn.execute(
                "SELECT COUNT(*) FROM products WHERE url = ?", (url,)
            ).fetchone()[0]
        finally:
            # Close even when the query fails so the handle is not leaked.
            conn.close()
        return count > 0

    except Exception as e:
        logger.error(f"检查重复失败: {e}")
        return False
|
||||
|
||||
def save_product_info(self, product_info):
    """Insert a product, or update the existing row that has the same URL.

    Args:
        product_info: Mapping that must contain ``url``; ``name``,
            ``introduction``, ``user_count``, ``maker_link`` and
            ``maker_statement`` are optional and stored as NULL if absent.

    Returns:
        True on success, False when anything fails (the error is logged).
    """
    try:
        conn = sqlite3.connect(self.product_db_path)
        try:
            cursor = conn.cursor()
            current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            cursor.execute("SELECT id FROM products WHERE url = ?", (product_info['url'],))
            existing = cursor.fetchone()

            if existing:
                # Refresh the stored row and its update timestamp.
                cursor.execute('''
                    UPDATE products SET
                        name = ?, introduction = ?, user_count = ?,
                        maker_link = ?, maker_statement = ?, updated_at = ?
                    WHERE url = ?
                ''', (
                    product_info.get('name'),
                    product_info.get('introduction'),
                    product_info.get('user_count'),
                    product_info.get('maker_link'),
                    product_info.get('maker_statement'),
                    current_time,
                    product_info['url'],
                ))
                logger.info(f"更新产品信息: {product_info.get('name', '未知')}")
            else:
                cursor.execute('''
                    INSERT INTO products
                    (url, name, introduction, user_count, maker_link, maker_statement, created_at, updated_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                ''', (
                    product_info['url'],
                    product_info.get('name'),
                    product_info.get('introduction'),
                    product_info.get('user_count'),
                    product_info.get('maker_link'),
                    product_info.get('maker_statement'),
                    current_time,
                    current_time,
                ))
                logger.info(f"新增产品信息: {product_info.get('name', '未知')}")

            conn.commit()
        finally:
            # Close even when a statement fails so the handle is not leaked.
            conn.close()
        return True

    except Exception as e:
        logger.error(f"保存产品信息失败: {e}")
        return False
|
||||
|
||||
async def scrape_product_info(self, url):
    """Scrape one product page with Playwright and return the fields found.

    Each field (name, introduction, user count, maker link, maker statement)
    is extracted on a best-effort basis: a failure in one field is logged as
    a warning and does not prevent the others from being collected.

    Args:
        url: Address of the product page to open.

    Returns:
        A dict that always contains 'url' and, when found, 'name',
        'introduction', 'user_count', 'maker_link' and 'maker_statement'.
        If the scrape as a whole fails (import error, launch or navigation
        failure), only ``{'url': url}`` is returned.
    """

    async def first_text(target_page, selectors, accept):
        # Return the first stripped text that accept() approves, trying the
        # selectors in order; None when no selector yields acceptable text.
        for selector in selectors:
            element = await target_page.query_selector(selector)
            if element is None:
                continue
            # text_content() may return None, so normalise before stripping.
            text = ((await element.text_content()) or '').strip()
            if not text:
                # Elements such as <meta> have no text; their value lives in
                # the `content` attribute.
                text = ((await element.get_attribute('content')) or '').strip()
            if accept(text):
                return text
        return None

    try:
        # Imported lazily so a missing Playwright install is reported through
        # the same error path as any other scrape failure.
        from playwright.async_api import async_playwright

        logger.info(f"开始抓取: {url}")
        product_info = {'url': url}

        # The context manager stops the Playwright driver on every exit path.
        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(headless=True)
            try:
                page = await browser.new_page()

                # Generous default timeout to leave room for the Cloudflare challenge.
                page.set_default_timeout(120000)

                await page.goto(url, wait_until="domcontentloaded")

                # Detect the Cloudflare interstitial by its page title.
                page_title = await page.title()
                if any(marker in page_title for marker in ("请稍候", "Checking", "Verifying")):
                    logger.info("检测到Cloudflare挑战页面,等待验证完成...")
                    try:
                        # Wait until the title no longer looks like the challenge page.
                        await page.wait_for_function(
                            """() => {
                                const title = document.title;
                                return !title.includes('请稍候') &&
                                       !title.includes('Checking') &&
                                       !title.includes('Verifying');
                            }""",
                            timeout=300000  # 5 minutes
                        )
                        logger.info("Cloudflare挑战已完成")
                    except Exception as e:
                        # Carry on and try to extract whatever the page shows.
                        logger.warning(f"等待Cloudflare挑战超时: {e}")

                # Give the page a moment to finish rendering.
                await page.wait_for_timeout(3000)

                # --- Product name -------------------------------------------
                try:
                    name_selectors = [
                        "xpath=//h1",
                        "xpath=//h1[@data-test='product-name']",
                        "xpath=//h1[contains(@class, 'text')]",
                        "xpath=//title"
                    ]
                    # Reject site-level titles and very short strings.
                    name_text = await first_text(
                        page,
                        name_selectors,
                        lambda text: bool(text) and 'Product Hunt' not in text and len(text) > 5,
                    )
                    if name_text is not None:
                        product_info['name'] = name_text
                        logger.info(f"提取到产品名称: {product_info['name']}")
                    else:
                        logger.warning("未找到有效的产品名称元素")
                except Exception as e:
                    logger.warning(f"提取产品名称失败: {e}")

                # --- Product introduction -----------------------------------
                try:
                    intro_selectors = [
                        "xpath=//*[@class='relative text-16 font-normal text-gray-700']//div",
                        "xpath=//p[contains(@class, 'description')]",
                        "xpath=//div[contains(@class, 'description')]",
                        "xpath=//meta[@name='description']"
                    ]
                    intro_text = await first_text(page, intro_selectors, bool)
                    if intro_text is not None:
                        product_info['introduction'] = intro_text
                        logger.info(f"提取到产品简介: {product_info['introduction'][:100]}...")
                    else:
                        logger.warning("未找到产品简介元素")
                except Exception as e:
                    logger.warning(f"提取产品简介失败: {e}")

                # --- User count (stored as the raw page text) ---------------
                try:
                    user_count_selectors = [
                        "xpath=//*[@class='flex flex-row gap-2']//div/div[2]/span/p",
                        "xpath=//span[contains(text(), 'users')]",
                        "xpath=//span[contains(text(), 'upvotes')]",
                        "xpath=//div[contains(@class, 'stats')]"
                    ]
                    user_count_text = await first_text(page, user_count_selectors, bool)
                    if user_count_text is not None:
                        product_info['user_count'] = user_count_text
                        logger.info(f"提取到用户数: {product_info['user_count']}")
                    else:
                        logger.warning("未找到用户数元素")
                except Exception as e:
                    logger.warning(f"提取用户数失败: {e}")

                # --- Maker link ---------------------------------------------
                try:
                    maker_link_selectors = [
                        "xpath=//span[contains(@class, 'absolute')]",
                        "xpath=//a[contains(@href, 'hunter')]",
                        "xpath=//a[contains(text(), 'hunter')]",
                        "xpath=//a[contains(@class, 'maker')]"
                    ]
                    base_url = "https://www.producthunt.com"

                    for selector in maker_link_selectors:
                        maker_element = await page.query_selector(selector)
                        if maker_element is None:
                            continue

                        maker_link = None
                        if 'span' in selector:
                            # A span match only helps if it sits inside an <a>.
                            # closest() may return null, in which case
                            # as_element() yields None instead of an element.
                            anchor_handle = await maker_element.evaluate_handle(
                                '(element) => element.closest("a")'
                            )
                            anchor = anchor_handle.as_element()
                            if anchor is not None:
                                maker_link = await anchor.get_attribute('href')
                        else:
                            maker_link = await maker_element.get_attribute('href')

                        if not maker_link:
                            # Nothing usable here; fall through to the next selector.
                            continue

                        # Turn site-relative links into absolute URLs.
                        if not maker_link.startswith('http'):
                            if maker_link.startswith('/'):
                                maker_link = base_url + maker_link
                            else:
                                maker_link = base_url + '/' + maker_link

                        product_info['maker_link'] = maker_link
                        logger.info(f"提取到制作人链接: {maker_link}")
                        break

                    if 'maker_link' not in product_info:
                        logger.warning("未找到制作人链接元素")
                except Exception as e:
                    logger.warning(f"提取制作人链接失败: {e}")

                # --- Maker statement (simplified) ---------------------------
                try:
                    if product_info.get('maker_link'):
                        # Open the maker link in a separate page.
                        new_page = await browser.new_page()
                        try:
                            await new_page.goto(product_info['maker_link'], wait_until="domcontentloaded")
                            await new_page.wait_for_timeout(5000)

                            # NOTE(review): the first selector targets one
                            # specific comment id, so it can presumably only
                            # match a single page; the generic selectors after
                            # it act as the fallbacks.
                            statement_selectors = [
                                "xpath=//*[@id='comment-4597755']/div/div[2]/div/div/div",
                                "xpath=//div[contains(@class, 'comment')]",
                                "xpath=//p[contains(@class, 'comment')]",
                                "xpath=//article"
                            ]
                            statement_text = await first_text(
                                new_page,
                                statement_selectors,
                                lambda text: len(text) > 10,
                            )
                            if statement_text is not None:
                                product_info['maker_statement'] = statement_text
                                logger.info(f"提取到制作人发言: {product_info['maker_statement'][:100]}...")
                        finally:
                            # Close the extra page even when extraction fails.
                            await new_page.close()
                    else:
                        logger.warning("没有制作人链接,跳过提取制作人发言")
                except Exception as e:
                    logger.warning(f"提取制作人发言失败: {e}")

            finally:
                # Always release the browser, including on navigation errors.
                await browser.close()

        logger.success(f"抓取完成: {product_info.get('name', '未知')}")
        return product_info

    except Exception as e:
        logger.error(f"抓取产品信息失败: {e}")
        return {'url': url}
|
||||
|
||||
async def process_urls(self):
    """Scrape and store every queried product URL that is not yet in the database.

    The URL list comes from ``self.query_producthunt_urls()`` and is kept on
    ``self.product_urls``. When the list is empty a warning is logged and the
    method returns without touching the database. Errors for a single URL are
    logged and do not stop the remaining URLs from being processed.
    """

    async def handle(link):
        # Process a single URL: skip it if already stored, otherwise scrape and save.
        if self.check_duplicate(link):
            logger.info(f"跳过已存在的链接: {link}")
            return

        scraped = await self.scrape_product_info(link)

        # NOTE(review): scrape_product_info returns a dict holding at least
        # 'url' even when scraping fails, so this check appears to always pass.
        if scraped:
            self.save_product_info(scraped)

    # Look up the candidate URLs.
    self.product_urls = self.query_producthunt_urls()
    if not self.product_urls:
        logger.warning("未找到包含producthunt.com的链接")
        return

    # Make sure the product database is ready before scraping.
    self.init_product_database()

    logger.info(f"开始处理 {len(self.product_urls)} 个产品链接")

    # One progress tick per URL, whether it was skipped, saved, or failed.
    with tqdm(total=len(self.product_urls), desc="处理进度") as progress:
        for link in self.product_urls:
            try:
                await handle(link)
            except Exception as exc:
                logger.error(f"处理链接失败 {link}: {exc}")
            progress.update(1)
|
||||
|
||||
def run(self):
    """Synchronous entry point: drive ``process_urls`` to completion.

    Any exception raised while running the asynchronous pipeline is logged
    rather than propagated to the caller.
    """
    logger.info("开始ProductHunt数据抓取任务")

    try:
        # Build the coroutine, then hand it to a fresh event loop.
        pipeline = self.process_urls()
        asyncio.run(pipeline)
        logger.success("任务完成")
    except Exception as err:
        logger.error(f"程序执行失败: {err}")
|
||||
|
||||
|
||||
def main():
    """Create a ProductHuntScraper and run the scraping job."""
    ProductHuntScraper().run()
|
||||
|
||||
|
||||
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
BIN
product/products.db
Normal file
BIN
product/products.db
Normal file
Binary file not shown.
Reference in New Issue
Block a user