407 lines
17 KiB
Python
407 lines
17 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
Product Hunt data scraper.

Queries tophub_data.db for links containing producthunt.com, scrapes each
product page with Playwright, and saves the product details to product.db.
|
||
"""
|
||
|
||
import sqlite3
|
||
import asyncio
|
||
import os
|
||
from datetime import datetime
|
||
from loguru import logger
|
||
from tqdm import tqdm
|
||
import sys
|
||
|
||
# Logging setup: drop loguru's default sink and log INFO and above to stderr.
logger.remove()
logger.add(
    sys.stderr,
    level="INFO",
    # Adjacent literals are concatenated into a single format string.
    format=(
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level: <8}</level> | "
        "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
        "<level>{message}</level>"
    ),
)
|
||
|
||
class ProductHuntScraper:
    """Collect Product Hunt product details and persist them in SQLite.

    URLs are read from the ``articles`` table of ``tophub_data.db``; each page
    is rendered with Playwright and the extracted fields are stored in the
    ``products`` table of ``product.db``.
    """

    def __init__(self):
        """Set the database paths relative to this file's location."""
        here = os.path.dirname(__file__)
        # The source database sits one directory above this file.
        self.tophub_db_path = os.path.join(os.path.dirname(here), "tophub_data.db")
        self.product_db_path = os.path.join(here, "product.db")
        self.product_urls = []

    def query_producthunt_urls(self):
        """Return every ``articles.url`` value that contains ``producthunt.com``.

        Returns:
            list: The matching URLs, or an empty list if the query fails
            (the error is logged, not raised).
        """
        logger.info("正在查询tophub_data.db数据库...")

        try:
            conn = sqlite3.connect(self.tophub_db_path)
            try:
                cursor = conn.cursor()
                cursor.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%'")
                urls = [row[0] for row in cursor.fetchall()]
            finally:
                # Release the connection even when the query raises.
                conn.close()

            logger.success(f"找到 {len(urls)} 个包含producthunt.com的链接")
            return urls

        except Exception as e:
            logger.error(f"查询数据库失败: {e}")
            return []

    def init_product_database(self):
        """Create the ``products`` table in ``product.db`` if it is missing.

        Failures are logged and swallowed; nothing is returned.
        """
        logger.info("正在初始化product.db数据库...")

        try:
            conn = sqlite3.connect(self.product_db_path)
            try:
                conn.cursor().execute('''
                    CREATE TABLE IF NOT EXISTS products (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        url TEXT NOT NULL UNIQUE,
                        name TEXT,
                        introduction TEXT,
                        user_count TEXT,
                        maker_link TEXT,
                        maker_statement TEXT,
                        created_at TEXT NOT NULL,
                        updated_at TEXT NOT NULL
                    )
                ''')
                conn.commit()
            finally:
                # Release the connection even when table creation raises.
                conn.close()

            logger.success("product.db数据库初始化完成")

        except Exception as e:
            logger.error(f"初始化数据库失败: {e}")

    def check_duplicate(self, url):
        """Report whether *url* already has a row in the ``products`` table.

        Args:
            url: The product URL to look up.

        Returns:
            bool: True if a row exists; False if not, or if the lookup fails
            (the error is logged, not raised).
        """
        try:
            conn = sqlite3.connect(self.product_db_path)
            try:
                cursor = conn.cursor()
                # An existence probe is enough; no need to count every match.
                cursor.execute("SELECT 1 FROM products WHERE url = ? LIMIT 1", (url,))
                return cursor.fetchone() is not None
            finally:
                conn.close()

        except Exception as e:
            logger.error(f"检查重复失败: {e}")
            return False

    def save_product_info(self, product_info):
        """Insert or update the ``products`` row for ``product_info['url']``.

        Args:
            product_info: Mapping with a required ``'url'`` key and optional
                ``name``, ``introduction``, ``user_count``, ``maker_link`` and
                ``maker_statement`` keys; missing optional keys are stored as
                NULL.

        Returns:
            bool: True when the row was written, False on any error (the
            error is logged, not raised).
        """
        detail_keys = ('name', 'introduction', 'user_count', 'maker_link', 'maker_statement')

        try:
            conn = sqlite3.connect(self.product_db_path)
            try:
                cursor = conn.cursor()
                url = product_info['url']
                details = tuple(product_info.get(key) for key in detail_keys)
                current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

                cursor.execute("SELECT id FROM products WHERE url = ?", (url,))
                if cursor.fetchone():
                    # Existing row: refresh the details, keep created_at.
                    cursor.execute('''
                        UPDATE products SET
                            name = ?, introduction = ?, user_count = ?,
                            maker_link = ?, maker_statement = ?, updated_at = ?
                        WHERE url = ?
                    ''', details + (current_time, url))
                    logger.info(f"更新产品信息: {product_info.get('name', '未知')}")
                else:
                    # New row: created_at and updated_at start out equal.
                    cursor.execute('''
                        INSERT INTO products
                            (url, name, introduction, user_count, maker_link, maker_statement, created_at, updated_at)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    ''', (url,) + details + (current_time, current_time))
                    logger.info(f"新增产品信息: {product_info.get('name', '未知')}")

                conn.commit()
            finally:
                # Release the connection even when a statement raises.
                conn.close()

            return True

        except Exception as e:
            logger.error(f"保存产品信息失败: {e}")
            return False

    @staticmethod
    async def _read_text(element, attribute=None):
        """Return the stripped text (or *attribute* value) of *element*, '' if absent."""
        if attribute:
            raw = await element.get_attribute(attribute)
        else:
            raw = await element.text_content()
        # Both calls may yield None; normalise that to an empty string.
        return raw.strip() if raw else ""

    async def _first_text(self, page, candidates, accept=bool):
        """Return the first accepted text among ``(selector, attribute)`` candidates, else None."""
        for selector, attribute in candidates:
            element = await page.query_selector(selector)
            if element is None:
                continue
            text = await self._read_text(element, attribute)
            if accept(text):
                return text
        return None

    @staticmethod
    async def _wait_for_cloudflare(page):
        """Wait for a Cloudflare challenge page, if one is showing, to finish."""
        page_title = await page.title()
        if not any(marker in page_title for marker in ("请稍候", "Checking", "Verifying")):
            return

        logger.info("检测到Cloudflare挑战页面,等待验证完成...")
        try:
            # Poll the document title until it no longer looks like a challenge.
            await page.wait_for_function(
                """() => {
                    const title = document.title;
                    return !title.includes('请稍候') &&
                           !title.includes('Checking') &&
                           !title.includes('Verifying') &&
                           title !== '请稍候…';
                }""",
                timeout=300000  # 5 minutes
            )
            logger.info("Cloudflare挑战已完成")
        except Exception as e:
            # Best effort: carry on and try to extract whatever is there.
            logger.warning(f"等待Cloudflare挑战超时: {e}")

    @staticmethod
    async def _find_maker_link(page):
        """Return the absolute maker profile URL found on *page*, or None."""
        base_url = "https://www.producthunt.com"
        selectors = [
            "xpath=//span[contains(@class, 'absolute')]",
            "xpath=//a[contains(@href, 'hunter')]",
            "xpath=//a[contains(text(), 'hunter')]",
            "xpath=//a[contains(@class, 'maker')]",
        ]

        for selector in selectors:
            element = await page.query_selector(selector)
            if element is None:
                continue

            if 'span' in selector:
                # The span is expected inside the anchor: climb to the nearest
                # <a>. as_element() is None when closest() found no anchor.
                handle = await element.evaluate_handle('(element) => element.closest("a")')
                anchor = handle.as_element()
                href = None
                if anchor is not None:
                    href = await anchor.get_attribute('href')
            else:
                href = await element.get_attribute('href')

            if not href:
                continue

            if not href.startswith('http'):
                # Turn a site-relative link into an absolute one.
                if href.startswith('/'):
                    href = base_url + href
                else:
                    href = base_url + '/' + href
            return href

        return None

    async def _fetch_maker_statement(self, browser, maker_link):
        """Open *maker_link* in a new page and return the first comment-like text, or None."""
        candidates = [
            # NOTE(review): this id targets one specific comment; it will only
            # ever match on the page that contains it.
            ("xpath=//*[@id='comment-4597755']/div/div[2]/div/div/div", None),
            ("xpath=//div[contains(@class, 'comment')]", None),
            ("xpath=//p[contains(@class, 'comment')]", None),
            ("xpath=//article", None),
        ]

        new_page = await browser.new_page()
        try:
            await new_page.goto(maker_link, wait_until="domcontentloaded")
            await new_page.wait_for_timeout(5000)
            return await self._first_text(new_page, candidates, accept=lambda text: len(text) > 10)
        finally:
            # Close the extra page even when navigation or extraction fails.
            await new_page.close()

    async def scrape_product_info(self, url):
        """Scrape one Product Hunt page with Playwright.

        Each field is extracted on a best-effort basis: a failure in one
        field is logged as a warning and does not prevent the others.

        Args:
            url: The product page to load.

        Returns:
            dict: Always contains ``'url'``; ``name``, ``introduction``,
            ``user_count``, ``maker_link`` and ``maker_statement`` are present
            only when they were found. If scraping fails outright the dict
            holds ``'url'`` alone.
        """
        try:
            # Imported lazily so a missing Playwright install is reported
            # through the error path below.
            from playwright.async_api import async_playwright

            logger.info(f"开始抓取: {url}")

            # The context manager stops the Playwright driver on every exit path.
            async with async_playwright() as playwright:
                browser = await playwright.chromium.launch(headless=True)
                try:
                    page = await browser.new_page()

                    # Generous timeout to sit through Cloudflare challenges.
                    page.set_default_timeout(120000)

                    await page.goto(url, wait_until="domcontentloaded")
                    await self._wait_for_cloudflare(page)

                    # Give client-side rendering a moment to settle.
                    await page.wait_for_timeout(3000)

                    product_info = {'url': url}

                    # Product name
                    try:
                        # NOTE(review): names of five characters or fewer are
                        # rejected by this filter; confirm that is intended.
                        name = await self._first_text(
                            page,
                            [
                                ("xpath=//h1", None),
                                ("xpath=//h1[@data-test='product-name']", None),
                                ("xpath=//h1[contains(@class, 'text')]", None),
                                ("xpath=//title", None),
                            ],
                            accept=lambda text: 'Product Hunt' not in text and len(text) > 5,
                        )
                        if name is not None:
                            product_info['name'] = name
                            logger.info(f"提取到产品名称: {product_info['name']}")
                        else:
                            logger.warning("未找到有效的产品名称元素")
                    except Exception as e:
                        logger.warning(f"提取产品名称失败: {e}")

                    # Product introduction
                    try:
                        introduction = await self._first_text(
                            page,
                            [
                                ("xpath=//*[@class='relative text-16 font-normal text-gray-700']//div", None),
                                ("xpath=//p[contains(@class, 'description')]", None),
                                ("xpath=//div[contains(@class, 'description')]", None),
                                # <meta> keeps its value in the content attribute.
                                ("xpath=//meta[@name='description']", "content"),
                            ],
                        )
                        if introduction is not None:
                            product_info['introduction'] = introduction
                            logger.info(f"提取到产品简介: {product_info['introduction'][:100]}...")
                        else:
                            logger.warning("未找到产品简介元素")
                    except Exception as e:
                        logger.warning(f"提取产品简介失败: {e}")

                    # User count
                    try:
                        user_count = await self._first_text(
                            page,
                            [
                                ("xpath=//*[@class='flex flex-row gap-2']//div/div[2]/span/p", None),
                                ("xpath=//span[contains(text(), 'users')]", None),
                                ("xpath=//span[contains(text(), 'upvotes')]", None),
                                ("xpath=//div[contains(@class, 'stats')]", None),
                            ],
                        )
                        if user_count is not None:
                            product_info['user_count'] = user_count
                            logger.info(f"提取到用户数: {product_info['user_count']}")
                        else:
                            logger.warning("未找到用户数元素")
                    except Exception as e:
                        logger.warning(f"提取用户数失败: {e}")

                    # Maker link
                    try:
                        maker_link = await self._find_maker_link(page)
                        if maker_link:
                            product_info['maker_link'] = maker_link
                            logger.info(f"提取到制作人链接: {maker_link}")
                        else:
                            logger.warning("未找到制作人链接元素")
                    except Exception as e:
                        logger.warning(f"提取制作人链接失败: {e}")

                    # Maker statement (simplified): read from the maker's page
                    try:
                        maker_link = product_info.get('maker_link')
                        if maker_link:
                            statement = await self._fetch_maker_statement(browser, maker_link)
                            if statement is not None:
                                product_info['maker_statement'] = statement
                                logger.info(f"提取到制作人发言: {product_info['maker_statement'][:100]}...")
                        else:
                            logger.warning("没有制作人链接,跳过提取制作人发言")
                    except Exception as e:
                        logger.warning(f"提取制作人发言失败: {e}")
                finally:
                    # Always shut the browser down, including on failure.
                    await browser.close()

            logger.success(f"抓取完成: {product_info.get('name', '未知')}")
            return product_info

        except Exception as e:
            logger.error(f"抓取产品信息失败: {e}")
            return {'url': url}

    async def process_urls(self):
        """Scrape and store every Product Hunt URL that is not yet in ``product.db``.

        URLs are de-duplicated (order preserved) before processing. A scrape
        that yields nothing besides the URL is not saved, so that URL is
        attempted again on a later run instead of being skipped forever.
        """
        # dict.fromkeys drops repeated URLs while keeping first-seen order.
        self.product_urls = list(dict.fromkeys(self.query_producthunt_urls()))

        if not self.product_urls:
            logger.warning("未找到包含producthunt.com的链接")
            return

        self.init_product_database()

        logger.info(f"开始处理 {len(self.product_urls)} 个产品链接")

        with tqdm(total=len(self.product_urls), desc="处理进度") as pbar:
            for url in self.product_urls:
                try:
                    if self.check_duplicate(url):
                        logger.info(f"跳过已存在的链接: {url}")
                        continue

                    product_info = await self.scrape_product_info(url)

                    # The result always carries 'url'; persist it only when at
                    # least one real field was extracted.
                    if len(product_info) > 1:
                        self.save_product_info(product_info)
                    else:
                        logger.warning(f"未抓取到任何产品信息,跳过保存: {url}")

                except Exception as e:
                    logger.error(f"处理链接失败 {url}: {e}")

                finally:
                    # Advance exactly once per URL on every path, skip included.
                    pbar.update(1)

    def run(self):
        """Run the whole pipeline synchronously; errors are logged, not raised."""
        logger.info("开始ProductHunt数据抓取任务")

        try:
            asyncio.run(self.process_urls())
            logger.success("任务完成")

        except Exception as e:
            logger.error(f"程序执行失败: {e}")
|
||
|
||
|
||
def main():
    """Script entry point: build a scraper and run it to completion."""
    ProductHuntScraper().run()
|
||
|
||
|
||
# Start the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()