Files
tophux_scrape/product/producthunt_scraper.py

407 lines
17 KiB
Python
Raw Normal View History

2025-11-23 11:15:45 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
ProductHunt数据抓取器
从tophub_data.db查询包含producthunt.com的链接然后使用Playwright抓取产品信息并保存到product.db
"""
import asyncio
import os
import sqlite3
import sys
from contextlib import closing
from datetime import datetime

from loguru import logger
from tqdm import tqdm
# Logging setup: drop loguru's default sink and install a single stderr sink
# at INFO level with a colourised "time | level | location - message" layout.
logger.remove()
logger.add(
    sys.stderr,
    level="INFO",
    format=(
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level: <8}</level> | "
        "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
        "<level>{message}</level>"
    ),
)
class ProductHuntScraper:
    """Scrape ProductHunt product pages and store the results in SQLite.

    URLs are read from the ``articles`` table of ``tophub_data.db`` (one
    directory above this file). Each page is rendered with Playwright and the
    extracted fields are written to the ``products`` table of ``product.db``
    (next to this file).
    """

    # Site root used to turn relative maker links into absolute URLs.
    BASE_URL = "https://www.producthunt.com"

    # Columns filled in by the scraper, in the order used by the SQL statements.
    SCRAPED_FIELDS = (
        'name',
        'introduction',
        'user_count',
        'maker_link',
        'maker_statement',
    )

    # Candidate selectors, tried in order; the first acceptable match wins.
    _NAME_SELECTORS = (
        "xpath=//h1",
        "xpath=//h1[@data-test='product-name']",
        "xpath=//h1[contains(@class, 'text')]",
        "xpath=//title",
    )
    _INTRO_SELECTORS = (
        "xpath=//*[@class='relative text-16 font-normal text-gray-700']//div",
        "xpath=//p[contains(@class, 'description')]",
        "xpath=//div[contains(@class, 'description')]",
        "xpath=//meta[@name='description']",
    )
    _USER_COUNT_SELECTORS = (
        "xpath=//*[@class='flex flex-row gap-2']//div/div[2]/span/p",
        "xpath=//span[contains(text(), 'users')]",
        "xpath=//span[contains(text(), 'upvotes')]",
        "xpath=//div[contains(@class, 'stats')]",
    )
    # (selector, needs_anchor): when needs_anchor is true the match is a <span>
    # and the link is taken from its closest enclosing <a>.
    _MAKER_LINK_SELECTORS = (
        ("xpath=//span[contains(@class, 'absolute')]", True),
        ("xpath=//a[contains(@href, 'hunter')]", False),
        ("xpath=//a[contains(text(), 'hunter')]", False),
        ("xpath=//a[contains(@class, 'maker')]", False),
    )
    # NOTE(review): the first selector targets one specific comment id and will
    # only ever match that single page -- confirm whether it is still wanted.
    _STATEMENT_SELECTORS = (
        "xpath=//*[@id='comment-4597755']/div/div[2]/div/div/div",
        "xpath=//div[contains(@class, 'comment')]",
        "xpath=//p[contains(@class, 'comment')]",
        "xpath=//article",
    )

    def __init__(self):
        """Resolve both database paths relative to this file."""
        here = os.path.dirname(__file__)
        self.tophub_db_path = os.path.join(os.path.dirname(here), "tophub_data.db")
        self.product_db_path = os.path.join(here, "product.db")
        self.product_urls = []

    # ------------------------------------------------------------------
    # Database access
    # ------------------------------------------------------------------
    def query_producthunt_urls(self):
        """Return every ``articles.url`` that contains ``producthunt.com``.

        Returns:
            list: The matching URLs, or an empty list if the query fails
            (the error is logged, not raised).
        """
        logger.info("正在查询tophub_data.db数据库...")
        try:
            # closing() guarantees the connection is released even when the
            # query raises (e.g. the table does not exist).
            with closing(sqlite3.connect(self.tophub_db_path)) as conn:
                rows = conn.execute(
                    "SELECT url FROM articles WHERE url LIKE '%producthunt.com%'"
                ).fetchall()
            urls = [row[0] for row in rows]
            logger.success(f"找到 {len(urls)} 个包含producthunt.com的链接")
            return urls
        except Exception as e:
            logger.error(f"查询数据库失败: {e}")
            return []

    def init_product_database(self):
        """Create the ``products`` table in ``product.db`` if it is missing.

        Failures are logged and swallowed; nothing is returned.
        """
        logger.info("正在初始化product.db数据库...")
        try:
            with closing(sqlite3.connect(self.product_db_path)) as conn:
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS products (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        url TEXT NOT NULL UNIQUE,
                        name TEXT,
                        introduction TEXT,
                        user_count TEXT,
                        maker_link TEXT,
                        maker_statement TEXT,
                        created_at TEXT NOT NULL,
                        updated_at TEXT NOT NULL
                    )
                ''')
                conn.commit()
            logger.success("product.db数据库初始化完成")
        except Exception as e:
            logger.error(f"初始化数据库失败: {e}")

    def check_duplicate(self, url):
        """Return True if *url* already has a row in ``products``.

        Any database error is logged and reported as "not a duplicate".
        """
        try:
            with closing(sqlite3.connect(self.product_db_path)) as conn:
                row = conn.execute(
                    "SELECT 1 FROM products WHERE url = ? LIMIT 1", (url,)
                ).fetchone()
            return row is not None
        except Exception as e:
            logger.error(f"检查重复失败: {e}")
            return False

    def save_product_info(self, product_info):
        """Insert or update the ``products`` row for ``product_info['url']``.

        Args:
            product_info: Mapping with a mandatory ``'url'`` key and any of the
                optional keys listed in ``SCRAPED_FIELDS``; missing keys are
                stored as NULL.

        Returns:
            bool: True when the row was written, False on any error (logged).
        """
        try:
            url = product_info['url']
            values = tuple(product_info.get(field) for field in self.SCRAPED_FIELDS)
            now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

            with closing(sqlite3.connect(self.product_db_path)) as conn:
                existing = conn.execute(
                    "SELECT id FROM products WHERE url = ?", (url,)
                ).fetchone()
                if existing:
                    conn.execute('''
                        UPDATE products SET
                            name = ?, introduction = ?, user_count = ?,
                            maker_link = ?, maker_statement = ?, updated_at = ?
                        WHERE url = ?
                    ''', values + (now, url))
                    logger.info(f"更新产品信息: {product_info.get('name', '未知')}")
                else:
                    conn.execute('''
                        INSERT INTO products
                            (url, name, introduction, user_count, maker_link,
                             maker_statement, created_at, updated_at)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    ''', (url,) + values + (now, now))
                    logger.info(f"新增产品信息: {product_info.get('name', '未知')}")
                conn.commit()
            return True
        except Exception as e:
            logger.error(f"保存产品信息失败: {e}")
            return False

    # ------------------------------------------------------------------
    # Scraping helpers
    # ------------------------------------------------------------------
    @classmethod
    def _absolute_url(cls, link):
        """Prefix site-relative *link* with ``BASE_URL``; keep http(s) links."""
        if link.startswith('http'):
            return link
        if link.startswith('/'):
            return cls.BASE_URL + link
        return cls.BASE_URL + '/' + link

    @classmethod
    def _has_scraped_data(cls, product_info):
        """Return True if *product_info* holds any field besides the URL."""
        if not product_info:
            return False
        return any(product_info.get(field) for field in cls.SCRAPED_FIELDS)

    @staticmethod
    async def _element_text(element):
        """Return the stripped text of *element* ('' when it has none).

        ``text_content()`` may return None, and ``<meta>`` elements keep their
        value in the ``content`` attribute, so both cases are handled here.
        """
        text = (await element.text_content() or "").strip()
        if not text:
            text = (await element.get_attribute('content') or "").strip()
        return text

    async def _first_text(self, page, selectors, accept=bool):
        """Return the first text accepted by *accept*, or None.

        Selectors are tried in order; for each one only the first matching
        element is inspected.
        """
        for selector in selectors:
            element = await page.query_selector(selector)
            if element is None:
                continue
            text = await self._element_text(element)
            if accept(text):
                return text
        return None

    @staticmethod
    async def _wait_for_cloudflare(page):
        """Wait (up to 5 minutes) for a Cloudflare interstitial to clear.

        The interstitial is recognised by its page title. A timeout is logged
        as a warning and scraping continues regardless.
        """
        page_title = await page.title()
        if not any(marker in page_title for marker in ("请稍候", "Checking", "Verifying")):
            return
        logger.info("检测到Cloudflare挑战页面等待验证完成...")
        try:
            await page.wait_for_function(
                """() => {
                    const title = document.title;
                    return !title.includes('请稍候') &&
                           !title.includes('Checking') &&
                           !title.includes('Verifying') &&
                           title !== '请稍候…';
                }""",
                timeout=300000,  # 5 minutes
            )
            logger.info("Cloudflare挑战已完成")
        except Exception as e:
            logger.warning(f"等待Cloudflare挑战超时: {e}")

    async def _find_maker_link(self, page):
        """Return the absolute maker link found on *page*, or None."""
        for selector, needs_anchor in self._MAKER_LINK_SELECTORS:
            element = await page.query_selector(selector)
            if element is None:
                continue
            if needs_anchor:
                handle = await element.evaluate_handle(
                    '(element) => element.closest("a")'
                )
                # The handle is truthy even when closest() returned null;
                # as_element() yields None in that case.
                element = handle.as_element()
                if element is None:
                    continue
            href = await element.get_attribute('href')
            if href:
                return self._absolute_url(href)
        return None

    async def _extract_page_fields(self, page, product_info):
        """Fill name, introduction, user_count and maker_link from *page*.

        Every field is extracted independently: a failure in one is logged as
        a warning and does not prevent the others from being attempted.
        """
        try:
            # NOTE(review): the length filter also rejects genuine product
            # names of five characters or fewer -- confirm this is intended.
            name = await self._first_text(
                page,
                self._NAME_SELECTORS,
                lambda text: 'Product Hunt' not in text and len(text) > 5,
            )
            if name:
                product_info['name'] = name
                logger.info(f"提取到产品名称: {product_info['name']}")
            else:
                logger.warning("未找到有效的产品名称元素")
        except Exception as e:
            logger.warning(f"提取产品名称失败: {e}")

        try:
            introduction = await self._first_text(page, self._INTRO_SELECTORS)
            if introduction:
                product_info['introduction'] = introduction
                logger.info(f"提取到产品简介: {product_info['introduction'][:100]}...")
            else:
                logger.warning("未找到产品简介元素")
        except Exception as e:
            logger.warning(f"提取产品简介失败: {e}")

        try:
            user_count = await self._first_text(page, self._USER_COUNT_SELECTORS)
            if user_count:
                product_info['user_count'] = user_count
                logger.info(f"提取到用户数: {product_info['user_count']}")
            else:
                logger.warning("未找到用户数元素")
        except Exception as e:
            logger.warning(f"提取用户数失败: {e}")

        try:
            maker_link = await self._find_maker_link(page)
            if maker_link:
                product_info['maker_link'] = maker_link
                logger.info(f"提取到制作人链接: {maker_link}")
            else:
                logger.warning("未找到制作人链接元素")
        except Exception as e:
            logger.warning(f"提取制作人链接失败: {e}")

    async def _extract_maker_statement(self, browser, product_info):
        """Open the maker link in a new page and store the maker's statement.

        Does nothing beyond logging when no maker link was found. The extra
        page is always closed, even if navigation or extraction fails.
        """
        maker_link = product_info.get('maker_link')
        if not maker_link:
            logger.warning("没有制作人链接,跳过提取制作人发言")
            return
        try:
            maker_page = await browser.new_page()
            try:
                await maker_page.goto(maker_link, wait_until="domcontentloaded")
                await maker_page.wait_for_timeout(5000)
                statement = await self._first_text(
                    maker_page,
                    self._STATEMENT_SELECTORS,
                    lambda text: len(text) > 10,
                )
            finally:
                await maker_page.close()
            if statement:
                product_info['maker_statement'] = statement
                logger.info(f"提取到制作人发言: {product_info['maker_statement'][:100]}...")
        except Exception as e:
            logger.warning(f"提取制作人发言失败: {e}")

    async def scrape_product_info(self, url):
        """Scrape one ProductHunt page with headless Chromium.

        Args:
            url: Address of the product page.

        Returns:
            dict: Always contains ``'url'``; the keys in ``SCRAPED_FIELDS``
            are present only for the fields that could be extracted. When the
            scrape fails as a whole (Playwright missing, navigation error,
            ...) the error is logged and ``{'url': url}`` is returned.
        """
        try:
            # Imported lazily so a missing Playwright installation is reported
            # through the normal error path below.
            from playwright.async_api import async_playwright

            logger.info(f"开始抓取: {url}")
            # The context manager stops Playwright and the finally clause
            # closes Chromium even when a step below raises; otherwise every
            # failed URL would leak a browser process.
            async with async_playwright() as playwright:
                browser = await playwright.chromium.launch(headless=True)
                try:
                    page = await browser.new_page()
                    # Generous timeout so Cloudflare challenges can complete.
                    page.set_default_timeout(120000)
                    await page.goto(url, wait_until="domcontentloaded")
                    await self._wait_for_cloudflare(page)
                    # Give client-side rendering a moment to settle.
                    await page.wait_for_timeout(3000)

                    product_info = {'url': url}
                    await self._extract_page_fields(page, product_info)
                    await self._extract_maker_statement(browser, product_info)
                finally:
                    await browser.close()

            logger.success(f"抓取完成: {product_info.get('name', '未知')}")
            return product_info
        except Exception as e:
            logger.error(f"抓取产品信息失败: {e}")
            return {'url': url}

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------
    async def process_urls(self):
        """Scrape and store every ProductHunt URL that is not yet in the DB."""
        # Drop repeated URLs (order preserved) so each page is scraped once.
        self.product_urls = list(dict.fromkeys(self.query_producthunt_urls()))
        if not self.product_urls:
            logger.warning("未找到包含producthunt.com的链接")
            return

        self.init_product_database()
        logger.info(f"开始处理 {len(self.product_urls)} 个产品链接")

        with tqdm(total=len(self.product_urls), desc="处理进度") as pbar:
            for url in self.product_urls:
                try:
                    if self.check_duplicate(url):
                        logger.info(f"跳过已存在的链接: {url}")
                        continue

                    product_info = await self.scrape_product_info(url)
                    # A failed scrape yields only {'url': ...}. Storing that
                    # would mark the URL as done and it would never be
                    # retried, so only rows with real data are saved.
                    if self._has_scraped_data(product_info):
                        self.save_product_info(product_info)
                    else:
                        logger.warning(f"未抓取到任何产品信息,跳过保存: {url}")
                except Exception as e:
                    logger.error(f"处理链接失败 {url}: {e}")
                finally:
                    pbar.update(1)

    def run(self):
        """Run the whole job synchronously; errors are logged, not raised."""
        logger.info("开始ProductHunt数据抓取任务")
        try:
            asyncio.run(self.process_urls())
            logger.success("任务完成")
        except Exception as e:
            logger.error(f"程序执行失败: {e}")
def main():
    """Script entry point: build a scraper and run the full scraping job."""
    ProductHuntScraper().run()


if __name__ == "__main__":
    main()