Files
tophux_scrape/product/producthunt_scraper.py

407 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
ProductHunt数据抓取器
从tophub_data.db查询包含producthunt.com的链接然后使用Playwright抓取产品信息并保存到product.db
"""
import asyncio
import os
import sqlite3
import sys
from contextlib import closing
from datetime import datetime
from urllib.parse import urljoin

from loguru import logger
from tqdm import tqdm
# Logging setup: drop loguru's default sink and install a single stderr sink
# at INFO level with a colourised "time | level | location - message" layout.
logger.remove()
logger.add(
    sys.stderr,
    level="INFO",
    format=(
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level: <8}</level> | "
        "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
        "<level>{message}</level>"
    ),
)
class ProductHuntScraper:
    """Scrape Product Hunt product pages and persist the results in SQLite.

    URLs are read from the ``articles`` table of ``tophub_data.db`` (parent
    directory of this file). Each page is rendered with Playwright and the
    extracted fields are written to the ``products`` table of ``product.db``
    (same directory as this file).
    """

    # Base used to turn relative hrefs found on a page into absolute URLs.
    BASE_URL = "https://www.producthunt.com"

    # Scraped columns of the ``products`` table, in SQL parameter order.
    _FIELDS = ("name", "introduction", "user_count", "maker_link", "maker_statement")

    # Candidate selectors, tried in order; the first acceptable match wins.
    _NAME_SELECTORS = (
        "xpath=//h1",
        "xpath=//h1[@data-test='product-name']",
        "xpath=//h1[contains(@class, 'text')]",
        "xpath=//title",
    )
    _INTRO_SELECTORS = (
        "xpath=//*[@class='relative text-16 font-normal text-gray-700']//div",
        "xpath=//p[contains(@class, 'description')]",
        "xpath=//div[contains(@class, 'description')]",
    )
    # <meta> elements carry their value in the ``content`` attribute, not in
    # their text, so this fallback is handled separately from the list above.
    _INTRO_META_SELECTOR = "xpath=//meta[@name='description']"
    _USER_COUNT_SELECTORS = (
        "xpath=//*[@class='flex flex-row gap-2']//div/div[2]/span/p",
        "xpath=//span[contains(text(), 'users')]",
        "xpath=//span[contains(text(), 'upvotes')]",
        "xpath=//div[contains(@class, 'stats')]",
    )
    # (selector, resolve_via_closest_anchor): a matched <span> is not a link
    # itself, so its href is taken from the nearest enclosing <a>.
    _MAKER_LINK_SELECTORS = (
        ("xpath=//span[contains(@class, 'absolute')]", True),
        ("xpath=//a[contains(@href, 'hunter')]", False),
        ("xpath=//a[contains(text(), 'hunter')]", False),
        ("xpath=//a[contains(@class, 'maker')]", False),
    )
    # NOTE(review): the first selector targets one hard-coded comment id and
    # will only ever match that single comment - confirm whether it is needed.
    _STATEMENT_SELECTORS = (
        "xpath=//*[@id='comment-4597755']/div/div[2]/div/div/div",
        "xpath=//div[contains(@class, 'comment')]",
        "xpath=//p[contains(@class, 'comment')]",
        "xpath=//article",
    )

    def __init__(self):
        """Resolve both database paths relative to this file's location."""
        module_dir = os.path.dirname(__file__)
        self.tophub_db_path = os.path.join(os.path.dirname(module_dir), "tophub_data.db")
        self.product_db_path = os.path.join(module_dir, "product.db")
        self.product_urls = []

    # ------------------------------------------------------------------
    # Database access
    # ------------------------------------------------------------------
    def query_producthunt_urls(self):
        """Return every ``articles.url`` that contains ``producthunt.com``.

        Returns:
            A list of URL strings; an empty list when the query fails (the
            error is logged, not raised).
        """
        logger.info("正在查询tophub_data.db数据库...")
        try:
            # closing() guarantees the connection is released even on error.
            with closing(sqlite3.connect(self.tophub_db_path)) as conn:
                rows = conn.execute(
                    "SELECT url FROM articles WHERE url LIKE '%producthunt.com%'"
                ).fetchall()
        except Exception as e:
            logger.error(f"查询数据库失败: {e}")
            return []
        urls = [row[0] for row in rows]
        logger.success(f"找到 {len(urls)} 个包含producthunt.com的链接")
        return urls

    def init_product_database(self):
        """Create the ``products`` table in ``product.db`` if it is missing.

        Returns:
            True when the table exists afterwards, False when initialisation
            failed (the error is logged, not raised).
        """
        logger.info("正在初始化product.db数据库...")
        try:
            with closing(sqlite3.connect(self.product_db_path)) as conn:
                conn.execute('''
                    CREATE TABLE IF NOT EXISTS products (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        url TEXT NOT NULL UNIQUE,
                        name TEXT,
                        introduction TEXT,
                        user_count TEXT,
                        maker_link TEXT,
                        maker_statement TEXT,
                        created_at TEXT NOT NULL,
                        updated_at TEXT NOT NULL
                    )
                ''')
                conn.commit()
        except Exception as e:
            logger.error(f"初始化数据库失败: {e}")
            return False
        logger.success("product.db数据库初始化完成")
        return True

    def check_duplicate(self, url):
        """Return True when *url* already has a row in ``products``.

        Any database error is logged and reported as "not a duplicate".
        """
        try:
            with closing(sqlite3.connect(self.product_db_path)) as conn:
                row = conn.execute(
                    "SELECT 1 FROM products WHERE url = ? LIMIT 1", (url,)
                ).fetchone()
            return row is not None
        except Exception as e:
            logger.error(f"检查重复失败: {e}")
            return False

    def save_product_info(self, product_info):
        """Insert or update the ``products`` row identified by ``product_info['url']``.

        Args:
            product_info: dict with a mandatory ``'url'`` key and optional
                ``name``, ``introduction``, ``user_count``, ``maker_link`` and
                ``maker_statement`` keys (missing ones are stored as NULL).

        Returns:
            True on success, False on any failure (logged, not raised).
        """
        try:
            url = product_info['url']
            values = tuple(product_info.get(field) for field in self._FIELDS)
            now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            with closing(sqlite3.connect(self.product_db_path)) as conn:
                existing = conn.execute(
                    "SELECT id FROM products WHERE url = ?", (url,)
                ).fetchone()
                if existing:
                    conn.execute('''
                        UPDATE products SET
                            name = ?, introduction = ?, user_count = ?,
                            maker_link = ?, maker_statement = ?, updated_at = ?
                        WHERE url = ?
                    ''', (*values, now, url))
                    action = "更新产品信息"
                else:
                    conn.execute('''
                        INSERT INTO products
                            (url, name, introduction, user_count, maker_link,
                             maker_statement, created_at, updated_at)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    ''', (url, *values, now, now))
                    action = "新增产品信息"
                conn.commit()
            # Log only after the commit so the message reflects persisted state.
            logger.info(f"{action}: {product_info.get('name', '未知')}")
            return True
        except Exception as e:
            logger.error(f"保存产品信息失败: {e}")
            return False

    # ------------------------------------------------------------------
    # Page extraction helpers
    # ------------------------------------------------------------------
    @classmethod
    def _absolute_url(cls, href):
        """Resolve *href* against BASE_URL; absolute URLs are returned as-is."""
        return urljoin(cls.BASE_URL + "/", href)

    @staticmethod
    def _is_plausible_name(text):
        """Reject empty/very short strings and generic 'Product Hunt' titles."""
        # NOTE(review): the length filter also rejects real product names of
        # five characters or fewer - kept from the original heuristic.
        return bool(text) and 'Product Hunt' not in text and len(text) > 5

    @staticmethod
    async def _element_text(element):
        """Return the stripped text of *element*, '' when it has no text."""
        # text_content() may return None; guard before calling .strip().
        return ((await element.text_content()) or "").strip()

    async def _first_text(self, page, selectors, accept=bool):
        """Return the text of the first selector match that *accept* approves, else None."""
        for selector in selectors:
            element = await page.query_selector(selector)
            if element is None:
                continue
            text = await self._element_text(element)
            if accept(text):
                return text
        return None

    async def _wait_for_cloudflare(self, page):
        """If *page* shows a Cloudflare interstitial, wait up to 5 minutes for it to clear."""
        page_title = await page.title()
        if not ("请稍候" in page_title or "Checking" in page_title or "Verifying" in page_title):
            return
        logger.info("检测到Cloudflare挑战页面等待验证完成...")
        try:
            # Poll in the browser until the title no longer looks like a challenge.
            await page.wait_for_function(
                """() => {
                    const title = document.title;
                    return !title.includes('请稍候') &&
                           !title.includes('Checking') &&
                           !title.includes('Verifying') &&
                           title !== '请稍候…';
                }""",
                timeout=300000  # 5 minutes
            )
            logger.info("Cloudflare挑战已完成")
        except Exception as e:
            # Best effort: continue and let the extraction steps report what is missing.
            logger.warning(f"等待Cloudflare挑战超时: {e}")

    async def _extract_introduction(self, page):
        """Return the product introduction text, or None when nothing matches."""
        intro = await self._first_text(page, self._INTRO_SELECTORS)
        if intro:
            return intro
        meta = await page.query_selector(self._INTRO_META_SELECTOR)
        if meta is not None:
            content = ((await meta.get_attribute('content')) or "").strip()
            if content:
                return content
        return None

    async def _extract_maker_link(self, page):
        """Return the absolute maker/hunter link found on *page*, or None."""
        for selector, via_anchor in self._MAKER_LINK_SELECTORS:
            element = await page.query_selector(selector)
            if element is None:
                continue
            if via_anchor:
                # Resolve the href inside the browser so a missing ancestor
                # anchor yields None instead of a handle wrapping JS null.
                href = await element.evaluate(
                    "(el) => { const a = el.closest('a');"
                    " return a ? a.getAttribute('href') : null; }"
                )
            else:
                href = await element.get_attribute('href')
            if href:
                return self._absolute_url(href)
        return None

    async def _extract_maker_statement(self, browser, maker_link):
        """Open *maker_link* in a new page and return the first comment-like text (>10 chars)."""
        maker_page = await browser.new_page()
        try:
            await maker_page.goto(maker_link, wait_until="domcontentloaded")
            await maker_page.wait_for_timeout(5000)
            return await self._first_text(
                maker_page, self._STATEMENT_SELECTORS, lambda text: len(text) > 10
            )
        finally:
            # Always release the extra page, even when navigation fails.
            await maker_page.close()

    async def _extract_fields(self, page, browser, product_info):
        """Fill *product_info* in place; each field is best-effort and failures are only logged."""
        # Product name
        try:
            name = await self._first_text(page, self._NAME_SELECTORS, self._is_plausible_name)
            if name:
                product_info['name'] = name
                logger.info(f"提取到产品名称: {product_info['name']}")
            else:
                logger.warning("未找到有效的产品名称元素")
        except Exception as e:
            logger.warning(f"提取产品名称失败: {e}")

        # Product introduction
        try:
            intro = await self._extract_introduction(page)
            if intro:
                product_info['introduction'] = intro
                logger.info(f"提取到产品简介: {product_info['introduction'][:100]}...")
            else:
                logger.warning("未找到产品简介元素")
        except Exception as e:
            logger.warning(f"提取产品简介失败: {e}")

        # User / upvote count (stored as raw text)
        try:
            user_count = await self._first_text(page, self._USER_COUNT_SELECTORS)
            if user_count:
                product_info['user_count'] = user_count
                logger.info(f"提取到用户数: {product_info['user_count']}")
            else:
                logger.warning("未找到用户数元素")
        except Exception as e:
            logger.warning(f"提取用户数失败: {e}")

        # Maker link
        try:
            maker_link = await self._extract_maker_link(page)
            if maker_link:
                product_info['maker_link'] = maker_link
                logger.info(f"提取到制作人链接: {maker_link}")
            else:
                logger.warning("未找到制作人链接元素")
        except Exception as e:
            logger.warning(f"提取制作人链接失败: {e}")

        # Maker statement (requires the maker link)
        try:
            if product_info.get('maker_link'):
                statement = await self._extract_maker_statement(
                    browser, product_info['maker_link']
                )
                if statement:
                    product_info['maker_statement'] = statement
                    logger.info(f"提取到制作人发言: {product_info['maker_statement'][:100]}...")
            else:
                logger.warning("没有制作人链接，跳过提取制作人发言")
        except Exception as e:
            logger.warning(f"提取制作人发言失败: {e}")

    async def scrape_product_info(self, url):
        """Render *url* with headless Chromium and extract the product fields.

        Args:
            url: Product Hunt page to scrape.

        Returns:
            A dict that always contains ``'url'`` plus whichever of ``name``,
            ``introduction``, ``user_count``, ``maker_link`` and
            ``maker_statement`` could be extracted. When the scrape as a whole
            fails (import error, launch or navigation failure) the dict holds
            only ``'url'``; the error is logged, not raised.
        """
        try:
            # Imported lazily so a missing Playwright install is logged like
            # any other scrape failure instead of breaking module import.
            from playwright.async_api import async_playwright

            logger.info(f"开始抓取: {url}")
            product_info = {'url': url}
            # The context manager stops the Playwright driver on every exit
            # path; the inner finally does the same for the browser process.
            async with async_playwright() as playwright:
                browser = await playwright.chromium.launch(headless=True)
                try:
                    page = await browser.new_page()
                    # Generous timeout so Cloudflare challenges can complete.
                    page.set_default_timeout(120000)
                    await page.goto(url, wait_until="domcontentloaded")
                    await self._wait_for_cloudflare(page)
                    # Give client-side rendering a moment to settle.
                    await page.wait_for_timeout(3000)
                    await self._extract_fields(page, browser, product_info)
                finally:
                    await browser.close()
            logger.success(f"抓取完成: {product_info.get('name', '未知')}")
            return product_info
        except Exception as e:
            logger.error(f"抓取产品信息失败: {e}")
            return {'url': url}

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------
    def _has_scraped_data(self, product_info):
        """Return True when at least one field besides the URL was extracted."""
        return bool(product_info) and any(product_info.get(field) for field in self._FIELDS)

    async def process_urls(self):
        """Scrape every Product Hunt URL not yet in ``product.db`` and store the result.

        URLs for which nothing could be extracted are not stored, so that
        ``check_duplicate`` does not skip them on the next run.
        """
        self.product_urls = self.query_producthunt_urls()
        if not self.product_urls:
            logger.warning("未找到包含producthunt.com的链接")
            return

        # Without the table every later lookup/save would fail; the error has
        # already been logged by init_product_database.
        if self.init_product_database() is False:
            return

        logger.info(f"开始处理 {len(self.product_urls)} 个产品链接")
        with tqdm(total=len(self.product_urls), desc="处理进度") as pbar:
            for url in self.product_urls:
                try:
                    if self.check_duplicate(url):
                        logger.info(f"跳过已存在的链接: {url}")
                        continue
                    product_info = await self.scrape_product_info(url)
                    if self._has_scraped_data(product_info):
                        self.save_product_info(product_info)
                    else:
                        logger.warning(f"未抓取到任何产品字段, 跳过保存以便下次重试: {url}")
                except Exception as e:
                    logger.error(f"处理链接失败 {url}: {e}")
                finally:
                    # Exactly one progress tick per URL on every path.
                    pbar.update(1)

    def run(self):
        """Synchronous entry point: run ``process_urls`` on a fresh event loop."""
        logger.info("开始ProductHunt数据抓取任务")
        try:
            asyncio.run(self.process_urls())
            logger.success("任务完成")
        except Exception as e:
            logger.error(f"程序执行失败: {e}")
def main():
    """Script entry point: build a scraper and run the whole scraping job."""
    ProductHuntScraper().run()
if __name__ == "__main__":
main()