407 lines
17 KiB
Python
407 lines
17 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
ProductHunt数据抓取器
|
|||
|
|
从tophub_data.db查询包含producthunt.com的链接,然后使用Playwright抓取产品信息并保存到product.db
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import sqlite3
|
|||
|
|
import asyncio
|
|||
|
|
import os
|
|||
|
|
from datetime import datetime
|
|||
|
|
from loguru import logger
|
|||
|
|
from tqdm import tqdm
|
|||
|
|
import sys
|
|||
|
|
|
|||
|
|
# Configure logging: drop loguru's default sink and log INFO+ to stderr
# with a timestamped, colourised "module:function:line" prefix.
logger.remove()
logger.add(
    sys.stderr,
    level="INFO",
    format=(
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level: <8}</level> | "
        "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
        "<level>{message}</level>"
    ),
)
|
|||
|
|
|
|||
|
|
class ProductHuntScraper:
    """Scrape Product Hunt product pages and persist the results to SQLite.

    URLs are read from the ``articles`` table of ``tophub_data.db``; each page
    is rendered with Playwright and the extracted fields are written to the
    ``products`` table of ``product.db``.
    """

    # Origin used to turn relative maker links into absolute URLs.
    _BASE_URL = "https://www.producthunt.com"

    # Selector fallbacks, tried in order; the first acceptable match wins.
    _NAME_SELECTORS = (
        "xpath=//h1",
        "xpath=//h1[@data-test='product-name']",
        "xpath=//h1[contains(@class, 'text')]",
        "xpath=//title",
    )
    _INTRO_SELECTORS = (
        "xpath=//*[@class='relative text-16 font-normal text-gray-700']//div",
        "xpath=//p[contains(@class, 'description')]",
        "xpath=//div[contains(@class, 'description')]",
        "xpath=//meta[@name='description']",
    )
    _USER_COUNT_SELECTORS = (
        "xpath=//*[@class='flex flex-row gap-2']//div/div[2]/span/p",
        "xpath=//span[contains(text(), 'users')]",
        "xpath=//span[contains(text(), 'upvotes')]",
        "xpath=//div[contains(@class, 'stats')]",
    )
    _MAKER_LINK_SELECTORS = (
        "xpath=//span[contains(@class, 'absolute')]",
        "xpath=//a[contains(@href, 'hunter')]",
        "xpath=//a[contains(text(), 'hunter')]",
        "xpath=//a[contains(@class, 'maker')]",
    )
    # NOTE(review): the first selector is pinned to one specific comment id and
    # will only ever match that single page; the generic fallbacks do the work.
    _STATEMENT_SELECTORS = (
        "xpath=//*[@id='comment-4597755']/div/div[2]/div/div/div",
        "xpath=//div[contains(@class, 'comment')]",
        "xpath=//p[contains(@class, 'comment')]",
        "xpath=//article",
    )

    # Returns the href of the nearest enclosing <a> (the element itself when it
    # already is an anchor), or null when there is none.
    _CLOSEST_ANCHOR_HREF_JS = (
        "(element) => { const anchor = element.closest('a'); "
        "return anchor ? anchor.getAttribute('href') : null; }"
    )

    def __init__(self):
        """Resolve the database locations relative to this file."""
        # abspath guards against a relative __file__, where dirname() would
        # collapse to "" and the parent-directory lookup would silently point
        # at the current working directory instead.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        self.tophub_db_path = os.path.join(os.path.dirname(script_dir), "tophub_data.db")
        self.product_db_path = os.path.join(script_dir, "product.db")
        self.product_urls = []

    def query_producthunt_urls(self):
        """Return the producthunt.com URLs stored in tophub_data.db.

        Returns:
            list: Distinct URLs in the order they were first returned by the
            query; an empty list if the database cannot be queried.
        """
        logger.info("正在查询tophub_data.db数据库...")

        conn = None
        try:
            conn = sqlite3.connect(self.tophub_db_path)
            cursor = conn.cursor()
            cursor.execute("SELECT url FROM articles WHERE url LIKE '%producthunt.com%'")
            # dict.fromkeys drops repeated URLs while preserving row order, so
            # the same page is not scraped more than once per run.
            urls = list(dict.fromkeys(row[0] for row in cursor.fetchall()))

            logger.success(f"找到 {len(urls)} 个包含producthunt.com的链接")
            return urls

        except Exception as e:
            logger.error(f"查询数据库失败: {e}")
            return []

        finally:
            # Close even when the query raised, so the handle is not leaked.
            if conn is not None:
                conn.close()

    def init_product_database(self):
        """Create the ``products`` table in product.db if it does not exist.

        Returns:
            bool: True when the schema is ready, False when initialisation
            failed (the error is logged, not raised).
        """
        logger.info("正在初始化product.db数据库...")

        conn = None
        try:
            conn = sqlite3.connect(self.product_db_path)
            conn.execute('''
                CREATE TABLE IF NOT EXISTS products (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    url TEXT NOT NULL UNIQUE,
                    name TEXT,
                    introduction TEXT,
                    user_count TEXT,
                    maker_link TEXT,
                    maker_statement TEXT,
                    created_at TEXT NOT NULL,
                    updated_at TEXT NOT NULL
                )
            ''')
            conn.commit()
            logger.success("product.db数据库初始化完成")
            return True

        except Exception as e:
            logger.error(f"初始化数据库失败: {e}")
            return False

        finally:
            if conn is not None:
                conn.close()

    def check_duplicate(self, url):
        """Report whether ``url`` already has a row in the products table.

        Args:
            url: Product page URL to look up.

        Returns:
            bool: True if a row exists; False if not, or if the lookup failed
            (the error is logged, not raised).
        """
        conn = None
        try:
            conn = sqlite3.connect(self.product_db_path)
            row = conn.execute(
                "SELECT 1 FROM products WHERE url = ? LIMIT 1", (url,)
            ).fetchone()
            return row is not None

        except Exception as e:
            logger.error(f"检查重复失败: {e}")
            return False

        finally:
            if conn is not None:
                conn.close()

    def save_product_info(self, product_info):
        """Insert or update the products row for ``product_info['url']``.

        Args:
            product_info: Mapping with a required ``'url'`` key and optional
                ``'name'``, ``'introduction'``, ``'user_count'``,
                ``'maker_link'`` and ``'maker_statement'`` keys; missing keys
                are stored as NULL.

        Returns:
            bool: True on success, False if anything failed (logged).
        """
        conn = None
        try:
            conn = sqlite3.connect(self.product_db_path)
            cursor = conn.cursor()

            current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            url = product_info['url']
            fields = (
                product_info.get('name'),
                product_info.get('introduction'),
                product_info.get('user_count'),
                product_info.get('maker_link'),
                product_info.get('maker_statement'),
            )

            cursor.execute("SELECT id FROM products WHERE url = ?", (url,))
            if cursor.fetchone():
                # Existing row: refresh the data but keep created_at.
                cursor.execute('''
                    UPDATE products SET
                        name = ?, introduction = ?, user_count = ?,
                        maker_link = ?, maker_statement = ?, updated_at = ?
                    WHERE url = ?
                ''', fields + (current_time, url))
                logger.info(f"更新产品信息: {product_info.get('name', '未知')}")
            else:
                cursor.execute('''
                    INSERT INTO products
                    (url, name, introduction, user_count, maker_link, maker_statement, created_at, updated_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                ''', (url,) + fields + (current_time, current_time))
                logger.info(f"新增产品信息: {product_info.get('name', '未知')}")

            conn.commit()
            return True

        except Exception as e:
            logger.error(f"保存产品信息失败: {e}")
            return False

        finally:
            # Closing without commit discards any partial write.
            if conn is not None:
                conn.close()

    @staticmethod
    def _is_plausible_name(text):
        """Reject generic page titles and very short strings as product names."""
        return bool(text) and 'Product Hunt' not in text and len(text) > 5

    @staticmethod
    def _is_plausible_statement(text):
        """Accept only statements longer than ten characters."""
        return bool(text) and len(text) > 10

    @staticmethod
    async def _first_text(page, selectors, accept=bool):
        """Return the first accepted text among ``selectors``, else None."""
        for selector in selectors:
            element = await page.query_selector(selector)
            if element is None:
                continue
            # text_content() may return None; normalise before stripping.
            text = (await element.text_content() or "").strip()
            if not text:
                # <meta> tags have no text; their value is in ``content``.
                text = (await element.get_attribute("content") or "").strip()
            if accept(text):
                return text
        return None

    async def _wait_for_cloudflare(self, page):
        """Wait for a Cloudflare interstitial, if one is showing, to clear."""
        page_title = await page.title()
        if not ("请稍候" in page_title or "Checking" in page_title or "Verifying" in page_title):
            return

        logger.info("检测到Cloudflare挑战页面,等待验证完成...")
        try:
            # Resolve once the title no longer looks like a challenge page.
            await page.wait_for_function(
                """() => {
                    const title = document.title;
                    return !title.includes('请稍候') &&
                           !title.includes('Checking') &&
                           !title.includes('Verifying') &&
                           title !== '请稍候…';
                }""",
                timeout=300000  # 5 minutes
            )
            logger.info("Cloudflare挑战已完成")
        except Exception as e:
            # Best effort: carry on and let field extraction decide.
            logger.warning(f"等待Cloudflare挑战超时: {e}")

    async def _extract_text_fields(self, page, product_info):
        """Fill name, introduction and user_count; each failure is isolated."""
        try:
            name = await self._first_text(page, self._NAME_SELECTORS, self._is_plausible_name)
            if name:
                product_info['name'] = name
                logger.info(f"提取到产品名称: {product_info['name']}")
            else:
                logger.warning("未找到有效的产品名称元素")
        except Exception as e:
            logger.warning(f"提取产品名称失败: {e}")

        try:
            introduction = await self._first_text(page, self._INTRO_SELECTORS)
            if introduction:
                product_info['introduction'] = introduction
                logger.info(f"提取到产品简介: {product_info['introduction'][:100]}...")
            else:
                logger.warning("未找到产品简介元素")
        except Exception as e:
            logger.warning(f"提取产品简介失败: {e}")

        try:
            user_count = await self._first_text(page, self._USER_COUNT_SELECTORS)
            if user_count:
                product_info['user_count'] = user_count
                logger.info(f"提取到用户数: {product_info['user_count']}")
            else:
                logger.warning("未找到用户数元素")
        except Exception as e:
            logger.warning(f"提取用户数失败: {e}")

    async def _extract_maker_link(self, page, product_info):
        """Fill ``maker_link`` with an absolute URL when one can be found."""
        try:
            for selector in self._MAKER_LINK_SELECTORS:
                element = await page.query_selector(selector)
                if element is None:
                    continue

                # A single in-page lookup handles both the <span>-inside-<a>
                # case and a direct <a> match, and yields None (not an
                # unusable handle) when there is no enclosing anchor.
                maker_link = await element.evaluate(self._CLOSEST_ANCHOR_HREF_JS)
                if not maker_link:
                    continue

                if not maker_link.startswith('http'):
                    if maker_link.startswith('/'):
                        maker_link = self._BASE_URL + maker_link
                    else:
                        maker_link = self._BASE_URL + '/' + maker_link

                product_info['maker_link'] = maker_link
                logger.info(f"提取到制作人链接: {maker_link}")
                break

            if 'maker_link' not in product_info:
                logger.warning("未找到制作人链接元素")

        except Exception as e:
            logger.warning(f"提取制作人链接失败: {e}")

    async def _extract_maker_statement(self, browser, product_info):
        """Open the maker link in a new page and fill ``maker_statement``."""
        try:
            if not product_info.get('maker_link'):
                logger.warning("没有制作人链接,跳过提取制作人发言")
                return

            new_page = await browser.new_page()
            try:
                await new_page.goto(product_info['maker_link'], wait_until="domcontentloaded")
                await new_page.wait_for_timeout(5000)

                statement = await self._first_text(
                    new_page, self._STATEMENT_SELECTORS, self._is_plausible_statement
                )
                if statement:
                    product_info['maker_statement'] = statement
                    logger.info(f"提取到制作人发言: {product_info['maker_statement'][:100]}...")
            finally:
                # Close the secondary page even if navigation failed.
                await new_page.close()

        except Exception as e:
            logger.warning(f"提取制作人发言失败: {e}")

    async def scrape_product_info(self, url):
        """Scrape one Product Hunt page with a headless Chromium browser.

        Args:
            url: Product page URL.

        Returns:
            dict: Always contains ``'url'``. ``'name'``, ``'introduction'``,
            ``'user_count'``, ``'maker_link'`` and ``'maker_statement'`` are
            present only when they were extracted. If the scrape fails as a
            whole, only ``{'url': url}`` is returned (the error is logged).
        """
        try:
            # Imported lazily so the module loads without Playwright installed.
            from playwright.async_api import async_playwright

            logger.info(f"开始抓取: {url}")

            # The context manager stops the Playwright driver on every exit
            # path; the inner finally does the same for the browser process.
            async with async_playwright() as playwright:
                browser = await playwright.chromium.launch(headless=True)
                try:
                    page = await browser.new_page()

                    # Generous default timeout to sit through Cloudflare.
                    page.set_default_timeout(120000)

                    await page.goto(url, wait_until="domcontentloaded")
                    await self._wait_for_cloudflare(page)

                    # Give client-side rendering a moment to settle.
                    await page.wait_for_timeout(3000)

                    product_info = {'url': url}
                    await self._extract_text_fields(page, product_info)
                    await self._extract_maker_link(page, product_info)
                    await self._extract_maker_statement(browser, product_info)
                finally:
                    await browser.close()

            logger.success(f"抓取完成: {product_info.get('name', '未知')}")
            return product_info

        except Exception as e:
            logger.error(f"抓取产品信息失败: {e}")
            return {'url': url}

    async def process_urls(self):
        """Scrape every not-yet-stored producthunt.com URL and save it.

        Results that contain nothing besides the URL are not saved, so a
        failed scrape is retried on the next run instead of being masked by
        an empty row that ``check_duplicate`` would then skip.
        """
        self.product_urls = self.query_producthunt_urls()

        if not self.product_urls:
            logger.warning("未找到包含producthunt.com的链接")
            return

        # Without the products table every later step would fail; the error
        # has already been logged by init_product_database.
        if self.init_product_database() is False:
            return

        logger.info(f"开始处理 {len(self.product_urls)} 个产品链接")

        with tqdm(total=len(self.product_urls), desc="处理进度") as pbar:
            for url in self.product_urls:
                try:
                    if self.check_duplicate(url):
                        logger.info(f"跳过已存在的链接: {url}")
                        continue

                    product_info = await self.scrape_product_info(url)

                    if product_info and any(key != 'url' for key in product_info):
                        self.save_product_info(product_info)
                    else:
                        logger.warning(f"未抓取到有效数据,跳过保存: {url}")

                except Exception as e:
                    logger.error(f"处理链接失败 {url}: {e}")

                finally:
                    # Advance exactly once per URL on every path.
                    pbar.update(1)

    def run(self):
        """Run the whole scraping task synchronously; errors are logged."""
        logger.info("开始ProductHunt数据抓取任务")

        try:
            asyncio.run(self.process_urls())
            logger.success("任务完成")

        except Exception as e:
            logger.error(f"程序执行失败: {e}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Entry point: build a ProductHuntScraper and run the scraping task."""
    ProductHuntScraper().run()
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
|
|||
|
|
main()
|