Files
tophux_scrape/product/advanced_scraper.py

312 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
高级ProductHunt抓取器 - 处理Cloudflare Turnstile挑战
"""
import asyncio
import sqlite3
from loguru import logger
import os
from urllib.parse import urlparse
class AdvancedProductHuntScraper:
    """Scrape ProductHunt product pages and persist the results in SQLite.

    Pages are loaded in a visible (non-headless) Chromium window driven by
    Playwright so that a person can complete the Cloudflare challenge by hand
    when one is shown. Results are written to the ``products`` table of the
    SQLite database located at ``db_path``.
    """

    # Chromium command-line switches used for the "stealthier" launch profile.
    # NOTE(review): '--disable-web-security', '--no-sandbox' and
    # '--disable-setuid-sandbox' relax browser protections; confirm they are
    # acceptable wherever this scraper is run.
    _LAUNCH_ARGS = [
        '--disable-blink-features=AutomationControlled',
        '--disable-features=VizDisplayCompositor',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        '--disable-web-security',
        '--disable-features=TranslateUI',
        '--disable-ipc-flooding-protection',
        '--no-sandbox',
        '--disable-setuid-sandbox'
    ]

    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

    _EXTRA_HTTP_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    # Injected into every page before its own scripts run; overrides the
    # navigator properties that would otherwise reveal automation.
    _STEALTH_INIT_SCRIPT = """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined,
        });
        Object.defineProperty(navigator, 'plugins', {
            get: () => [1, 2, 3, 4, 5],
        });
        Object.defineProperty(navigator, 'languages', {
            get: () => ['en-US', 'en'],
        });
    """

    # Page-title fragments treated as "this is the Cloudflare challenge page".
    _CHALLENGE_TITLE_MARKERS = ("请稍候", "Checking", "Verifying")

    # Browser-side predicate: true once the title no longer looks like the
    # challenge page.
    _CHALLENGE_CLEARED_JS = """() => {
        const title = document.title;
        return !title.includes('请稍候') &&
            !title.includes('Checking') &&
            !title.includes('Verifying') &&
            title !== '请稍候…';
    }"""

    # Selectors tried in order when looking for the product name.
    _NAME_SELECTORS = (
        "h1",
        "[data-test='product-name']",
        ".product-name",
        "title",
    )

    _CHALLENGE_TIMEOUT_MS = 300000  # 5 minutes
    _SETTLE_DELAY_MS = 5000

    def __init__(self, db_path="test_product.db"):
        """Store the database location and make sure the schema exists.

        Args:
            db_path: Path of the SQLite file that holds the ``products`` table.
        """
        self.db_path = db_path
        self.init_database()

    def init_database(self):
        """Create the ``products`` table if it does not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS products (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT,
                    url TEXT UNIQUE,
                    introduction TEXT,
                    user_count INTEGER,
                    maker_link TEXT,
                    maker_statement TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            conn.commit()
        finally:
            # Close even when the statement fails so the handle is not leaked.
            conn.close()
        logger.info(f"数据库已初始化: {self.db_path}")

    def check_duplicate(self, url):
        """Return True if a product with this URL is already stored.

        Args:
            url: Product URL to look up in the ``products`` table.

        Returns:
            bool: True when a row with that URL exists, otherwise False.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            row = conn.execute(
                "SELECT id FROM products WHERE url = ?", (url,)
            ).fetchone()
        finally:
            conn.close()
        return row is not None

    def save_product_info(self, product_info):
        """Insert a product, or update the stored row that has the same URL.

        Args:
            product_info: Mapping with the keys ``url``, ``name``,
                ``introduction``, ``user_count``, ``maker_link`` and
                ``maker_statement``.

        Raises:
            KeyError: If one of the required keys is missing.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            existing = conn.execute(
                "SELECT id FROM products WHERE url = ?", (product_info['url'],)
            ).fetchone()
            if existing:
                # Refresh the existing record and bump its update timestamp.
                conn.execute("""
                    UPDATE products SET
                        name = ?, introduction = ?, user_count = ?,
                        maker_link = ?, maker_statement = ?, updated_at = CURRENT_TIMESTAMP
                    WHERE url = ?
                """, (
                    product_info['name'], product_info['introduction'],
                    product_info['user_count'], product_info['maker_link'],
                    product_info['maker_statement'], product_info['url']
                ))
                logger.info(f"更新产品信息: {product_info['name']}")
            else:
                conn.execute("""
                    INSERT INTO products (name, url, introduction, user_count, maker_link, maker_statement)
                    VALUES (?, ?, ?, ?, ?, ?)
                """, (
                    product_info['name'], product_info['url'], product_info['introduction'],
                    product_info['user_count'], product_info['maker_link'], product_info['maker_statement']
                ))
                logger.info(f"保存产品信息: {product_info['name']}")
            conn.commit()
        finally:
            # An uncommitted change is discarded when the connection closes.
            conn.close()

    async def scrape_with_stealth(self, url):
        """Load a product page in a visible browser and extract its details.

        Args:
            url: Address of the product page to scrape.

        Returns:
            dict: Keys ``url``, ``name``, ``introduction``, ``user_count``,
            ``maker_link`` and ``maker_statement``. Only ``url`` and ``name``
            are filled in; the rest are None. When anything fails, the error
            is logged and the same shape is returned with ``name`` set to
            ``'Error'`` -- this method does not raise.
        """
        try:
            # Imported lazily so the class can be used for database work
            # without Playwright installed.
            from playwright.async_api import async_playwright
            logger.info(f"开始高级抓取: {url}")
            # The context manager stops the Playwright driver on every exit path.
            async with async_playwright() as playwright:
                browser = await playwright.chromium.launch(
                    headless=False,  # visible window so the challenge can be solved by hand
                    args=list(self._LAUNCH_ARGS),
                )
                try:
                    product_info = await self._scrape_page(browser, url)
                finally:
                    # Always close the browser, otherwise a failed scrape
                    # leaves the window and its processes running.
                    await browser.close()
        except Exception as e:
            logger.error(f"抓取失败: {e}")
            return {'url': url, 'name': 'Error', 'introduction': None, 'user_count': None, 'maker_link': None, 'maker_statement': None}
        logger.success(f"抓取完成: {product_info['name']}")
        return product_info

    async def _scrape_page(self, browser, url):
        """Open ``url`` in a new browser context and build the product record."""
        context = await browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent=self._USER_AGENT,
            extra_http_headers=dict(self._EXTRA_HTTP_HEADERS),
        )
        page = await context.new_page()
        await page.add_init_script(self._STEALTH_INIT_SCRIPT)
        # Generous default because a person may need time to pass the challenge.
        page.set_default_timeout(self._CHALLENGE_TIMEOUT_MS)
        await page.goto(url, wait_until="domcontentloaded")

        page_title = await page.title()
        logger.info(f"页面标题: {page_title}")
        if any(marker in page_title for marker in self._CHALLENGE_TITLE_MARKERS):
            await self._wait_for_challenge(page)

        # Give the page a moment to finish rendering before reading it.
        await page.wait_for_timeout(self._SETTLE_DELAY_MS)

        current_url = page.url
        logger.info(f"当前页面URL: {current_url}")
        if current_url != url:
            logger.warning(f"页面已重定向: {url} -> {current_url}")

        name = await self._extract_name(page, url)
        # Simplified version: only the name is extracted for now.
        return {
            'url': url,
            'name': name,
            'introduction': None,
            'user_count': None,
            'maker_link': None,
            'maker_statement': None,
        }

    async def _wait_for_challenge(self, page):
        """Wait for the Cloudflare challenge to clear; reload the page on timeout."""
        logger.info("检测到Cloudflare挑战页面等待用户手动验证...")
        try:
            await page.wait_for_function(
                self._CHALLENGE_CLEARED_JS,
                timeout=self._CHALLENGE_TIMEOUT_MS,
            )
            logger.info("Cloudflare挑战已完成")
        except Exception as e:
            logger.warning(f"等待Cloudflare挑战超时: {e}")
            # Last resort after a timeout: reload and carry on with whatever loads.
            await page.reload(wait_until="domcontentloaded")
            logger.info("已刷新页面")

    async def _extract_name(self, page, url):
        """Return the product name from the page, else from the URL, else a placeholder."""
        for selector in self._NAME_SELECTORS:
            try:
                element = await page.query_selector(selector)
                if not element:
                    continue
                text = await element.text_content()
            except Exception as e:
                # Best effort: a failing selector must not abort the scrape.
                logger.debug(f"选择器 {selector} 失败: {e}")
                continue
            name = text.strip() if text else ""
            # The bare host name is not a product name, so it is skipped.
            if name and name != "www.producthunt.com":
                logger.info(f"提取到产品名称: {name}")
                return name

        name = self._name_from_url(url)
        if name:
            logger.info(f"从URL提取产品名称: {name}")
            return name
        logger.warning("无法提取产品名称")
        return "Unknown Product"

    @staticmethod
    def _name_from_url(url):
        """Derive a title-cased name from a ``.../products/<slug>`` URL, or return None."""
        # Trailing slashes are dropped first so '/products/foo/' still yields
        # the slug and '/products/' does not produce an empty name.
        path_parts = urlparse(url).path.rstrip('/').split('/')
        if len(path_parts) >= 3 and path_parts[-2] == 'products':
            return path_parts[-1].replace('-', ' ').title()
        return None

    async def run_test(self):
        """Scrape up to three ProductHunt links from ``tophub_data.db`` and print a summary.

        The source database is looked up one directory above the directory
        part of ``self.db_path`` and must contain an ``articles`` table with a
        ``url`` column.
        """
        tophub_db_path = os.path.join(os.path.dirname(self.db_path), "..", "tophub_data.db")
        conn = sqlite3.connect(tophub_db_path)
        try:
            rows = conn.execute("""
                SELECT url FROM articles
                WHERE url LIKE '%producthunt.com%'
                LIMIT 3
            """).fetchall()
        finally:
            conn.close()
        urls = [row[0] for row in rows]
        logger.info(f"找到 {len(urls)} 个ProductHunt链接")

        for url in urls:
            logger.info(f"处理URL: {url}")
            # check_duplicate() is deliberately not consulted here, so every
            # URL is scraped again even when it is already stored.
            product_info = await self.scrape_with_stealth(url)
            # NOTE(review): a failed scrape returns the 'Error' placeholder,
            # which is saved like any other result and replaces the stored
            # row for that URL.
            self.save_product_info(product_info)

        conn = sqlite3.connect(self.db_path)
        try:
            count = conn.execute("SELECT COUNT(*) FROM products").fetchone()[0]
            products = conn.execute("SELECT name, url FROM products").fetchall()
        finally:
            conn.close()

        logger.success("测试任务完成")
        print("\n=== 测试结果统计 ===")
        print(f"数据库中的产品数量: {count}")
        print("已抓取的产品:")
        for name, url in products:
            print(f" - {name}: {url}")
async def main():
    """Configure file logging, then run the scraper's test routine."""
    # Remove the handlers registered so far so that log output goes only to
    # the file sink added below.
    logger.remove()
    file_sink_options = {
        "level": "DEBUG",
        "format": "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {name}:{function}:{line} - {message}",
        "rotation": "10 MB",
        "retention": "7 days",
    }
    logger.add("advanced_scraper.log", **file_sink_options)

    # Build the scraper with its default database path and run the test pass.
    await AdvancedProductHuntScraper().run_test()
if __name__ == "__main__":
    # Script entry point: run the async main() to completion.
    asyncio.run(main())