312 lines
12 KiB
Python
312 lines
12 KiB
Python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Advanced ProductHunt scraper - handles the Cloudflare Turnstile challenge.
"""
|
||
|
||
import asyncio
|
||
import sqlite3
|
||
from loguru import logger
|
||
import os
|
||
from urllib.parse import urlparse
|
||
|
||
class AdvancedProductHuntScraper:
    """Scrape ProductHunt product pages with Playwright and persist them in SQLite.

    The scraper opens a visible (non-headless) Chromium window so that a human
    can solve a Cloudflare challenge when one is shown, extracts the product
    name from the page, and stores one row per URL in the ``products`` table
    of the database at ``db_path``.
    """

    # Upper bound (ms) for page operations and for waiting on a manual
    # Cloudflare verification: 5 minutes.
    _CHALLENGE_TIMEOUT_MS = 300000
    # Fixed pause (ms) after navigation/verification before reading the page.
    _SETTLE_DELAY_MS = 5000
    # Substrings of the page title that identify a Cloudflare interstitial.
    _CHALLENGE_TITLE_MARKERS = ("请稍候", "Checking", "Verifying")
    # Selectors tried in order when looking for the product name.
    _NAME_SELECTORS = (
        "h1",
        "[data-test='product-name']",
        ".product-name",
        "title",
    )
    # NOTE(review): '--disable-web-security' and '--no-sandbox' weaken browser
    # isolation; they are kept because the original configuration used them.
    # NOTE(review): '--disable-features' is passed twice; Chromium may honor
    # only the last occurrence - confirm before relying on both.
    _LAUNCH_ARGS = (
        '--disable-blink-features=AutomationControlled',
        '--disable-features=VizDisplayCompositor',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        '--disable-web-security',
        '--disable-features=TranslateUI',
        '--disable-ipc-flooding-protection',
        '--no-sandbox',
        '--disable-setuid-sandbox',
    )
    _USER_AGENT = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    )
    _EXTRA_HTTP_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    # Injected into every page before its own scripts run, to mask the most
    # common automation fingerprints.
    _STEALTH_INIT_SCRIPT = """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined,
        });
        Object.defineProperty(navigator, 'plugins', {
            get: () => [1, 2, 3, 4, 5],
        });
        Object.defineProperty(navigator, 'languages', {
            get: () => ['en-US', 'en'],
        });
    """
    # Resolves once the document title no longer looks like a challenge page.
    _CHALLENGE_DONE_PREDICATE = """() => {
        const title = document.title;
        return !title.includes('请稍候') &&
               !title.includes('Checking') &&
               !title.includes('Verifying') &&
               title !== '请稍候…';
    }"""

    def __init__(self, db_path="test_product.db"):
        """Store the database path and make sure the schema exists.

        Args:
            db_path: Path of the SQLite file that holds the ``products`` table.
        """
        self.db_path = db_path
        self.init_database()

    def init_database(self):
        """Create the ``products`` table if it does not exist yet."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS products (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT,
                    url TEXT UNIQUE,
                    introduction TEXT,
                    user_count INTEGER,
                    maker_link TEXT,
                    maker_statement TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            conn.commit()
        finally:
            # Close even when the DDL fails so the file handle is not leaked.
            conn.close()
        logger.info(f"数据库已初始化: {self.db_path}")

    def check_duplicate(self, url):
        """Return True when a row with this exact ``url`` is already stored."""
        conn = sqlite3.connect(self.db_path)
        try:
            row = conn.execute(
                "SELECT id FROM products WHERE url = ?", (url,)
            ).fetchone()
        finally:
            conn.close()
        return row is not None

    def save_product_info(self, product_info):
        """Insert the product, or update the existing row with the same URL.

        Args:
            product_info: Mapping with the keys ``name``, ``url``,
                ``introduction``, ``user_count``, ``maker_link`` and
                ``maker_statement``.

        Raises:
            KeyError: If one of the expected keys is missing.
            sqlite3.Error: If the database rejects the statement.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT id FROM products WHERE url = ?", (product_info['url'],)
            )

            if cursor.fetchone():
                # Row exists: refresh its fields and bump updated_at.
                cursor.execute("""
                    UPDATE products SET
                        name = ?, introduction = ?, user_count = ?,
                        maker_link = ?, maker_statement = ?, updated_at = CURRENT_TIMESTAMP
                    WHERE url = ?
                """, (
                    product_info['name'], product_info['introduction'],
                    product_info['user_count'], product_info['maker_link'],
                    product_info['maker_statement'], product_info['url'],
                ))
                logger.info(f"更新产品信息: {product_info['name']}")
            else:
                cursor.execute("""
                    INSERT INTO products (name, url, introduction, user_count, maker_link, maker_statement)
                    VALUES (?, ?, ?, ?, ?, ?)
                """, (
                    product_info['name'], product_info['url'], product_info['introduction'],
                    product_info['user_count'], product_info['maker_link'], product_info['maker_statement'],
                ))
                logger.info(f"保存产品信息: {product_info['name']}")

            conn.commit()
        finally:
            # Uncommitted work is discarded when closing after an error.
            conn.close()

    @staticmethod
    def _name_from_url(url):
        """Derive a display name from a ``.../products/<slug>`` URL.

        Empty path segments are ignored, so a trailing slash does not defeat
        the match. Returns ``None`` when the URL does not end in
        ``products/<slug>``.
        """
        segments = [part for part in urlparse(url).path.split('/') if part]
        if len(segments) >= 2 and segments[-2] == 'products':
            return segments[-1].replace('-', ' ').title()
        return None

    async def _wait_for_cloudflare(self, page):
        """Block until a Cloudflare challenge page (if any) has been passed.

        The page title is used as the signal. When the wait fails or times
        out the page is reloaded once and scraping continues best-effort.
        """
        page_title = await page.title()
        logger.info(f"页面标题: {page_title}")

        if not any(marker in page_title for marker in self._CHALLENGE_TITLE_MARKERS):
            return

        logger.info("检测到Cloudflare挑战页面,等待用户手动验证...")
        try:
            await page.wait_for_function(
                self._CHALLENGE_DONE_PREDICATE,
                timeout=self._CHALLENGE_TIMEOUT_MS,
            )
            logger.info("Cloudflare挑战已完成")
        except Exception as e:
            logger.warning(f"等待Cloudflare挑战超时: {e}")
            # Last resort: reload and carry on with whatever the page shows.
            await page.reload(wait_until="domcontentloaded")
            logger.info("已刷新页面")

    async def _extract_product_name(self, page, url):
        """Return the product name from the page, the URL, or a placeholder."""
        for selector in self._NAME_SELECTORS:
            try:
                element = await page.query_selector(selector)
                if not element:
                    continue
                text = await element.text_content()
                # The bare hostname is not a product name; skip it and try
                # the next selector.
                if text and text.strip() and text.strip() != "www.producthunt.com":
                    name = text.strip()
                    logger.info(f"提取到产品名称: {name}")
                    return name
            except Exception as e:
                # A failing selector must not abort the remaining candidates.
                logger.debug(f"选择器 {selector} 失败: {e}")

        name = self._name_from_url(url)
        if name is not None:
            logger.info(f"从URL提取产品名称: {name}")
            return name

        logger.warning("无法提取产品名称")
        return "Unknown Product"

    async def scrape_with_stealth(self, url):
        """Scrape one product page with a fingerprint-masked browser.

        Args:
            url: Address of the product page.

        Returns:
            A dict with the keys ``url``, ``name``, ``introduction``,
            ``user_count``, ``maker_link`` and ``maker_statement``. Only
            ``url`` and ``name`` are populated; the rest are ``None``. On any
            failure ``name`` is ``'Error'``; the method never raises.
        """
        try:
            # Imported lazily so the module can be loaded without Playwright.
            from playwright.async_api import async_playwright

            logger.info(f"开始高级抓取: {url}")

            # The context manager stops the Playwright driver on every exit
            # path; the inner finally does the same for the browser process.
            async with async_playwright() as playwright:
                browser = await playwright.chromium.launch(
                    headless=False,  # visible window so a human can verify
                    args=list(self._LAUNCH_ARGS),
                )
                try:
                    context = await browser.new_context(
                        viewport={'width': 1920, 'height': 1080},
                        user_agent=self._USER_AGENT,
                        extra_http_headers=dict(self._EXTRA_HTTP_HEADERS),
                    )
                    page = await context.new_page()
                    await page.add_init_script(self._STEALTH_INIT_SCRIPT)
                    page.set_default_timeout(self._CHALLENGE_TIMEOUT_MS)

                    await page.goto(url, wait_until="domcontentloaded")
                    await self._wait_for_cloudflare(page)

                    # Give client-side rendering a moment to finish.
                    await page.wait_for_timeout(self._SETTLE_DELAY_MS)

                    current_url = page.url
                    logger.info(f"当前页面URL: {current_url}")
                    if current_url != url:
                        logger.warning(f"页面已重定向: {url} -> {current_url}")

                    product_info = {
                        'url': url,
                        'name': await self._extract_product_name(page, url),
                        # Simplified version: remaining fields not scraped yet.
                        'introduction': None,
                        'user_count': None,
                        'maker_link': None,
                        'maker_statement': None,
                    }
                finally:
                    await browser.close()

            logger.success(f"抓取完成: {product_info['name']}")
            return product_info

        except Exception as e:
            logger.error(f"抓取失败: {e}")
            return {'url': url, 'name': 'Error', 'introduction': None, 'user_count': None, 'maker_link': None, 'maker_statement': None}

    async def run_test(self):
        """Scrape up to three ProductHunt links from ``tophub_data.db`` and report.

        The source database is looked up one directory above the directory of
        ``self.db_path``. Each result is saved, then the contents of the
        ``products`` table are printed.
        """
        tophub_db_path = os.path.join(os.path.dirname(self.db_path), "..", "tophub_data.db")

        source_conn = sqlite3.connect(tophub_db_path)
        try:
            rows = source_conn.execute("""
                SELECT url FROM articles
                WHERE url LIKE '%producthunt.com%'
                LIMIT 3
            """).fetchall()
        finally:
            source_conn.close()
        urls = [row[0] for row in rows]

        logger.info(f"找到 {len(urls)} 个ProductHunt链接")

        for url in urls:
            logger.info(f"处理URL: {url}")

            # The duplicate check (self.check_duplicate) is intentionally not
            # applied here so that every URL is re-scraped on each run.
            product_info = await self.scrape_with_stealth(url)
            self.save_product_info(product_info)

        result_conn = sqlite3.connect(self.db_path)
        try:
            count = result_conn.execute("SELECT COUNT(*) FROM products").fetchone()[0]
            products = result_conn.execute("SELECT name, url FROM products").fetchall()
        finally:
            result_conn.close()

        logger.success("测试任务完成")

        print("\n=== 测试结果统计 ===")
        print(f"数据库中的产品数量: {count}")
        print("已抓取的产品:")
        for name, url in products:
            print(f" - {name}: {url}")
|
||
|
||
async def main():
    """Configure file logging, then run the scraper's test routine."""
    # Remove the previously registered sinks so records only reach the file
    # sink added below.
    logger.remove()

    file_sink_options = {
        "level": "DEBUG",
        "format": "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {name}:{function}:{line} - {message}",
        "rotation": "10 MB",
        "retention": "7 days",
    }
    logger.add("advanced_scraper.log", **file_sink_options)

    # Build the scraper with its default database path and run the test.
    await AdvancedProductHuntScraper().run_test()
|
||
|
||
# Script entry point: run the async main() on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())