369 lines
15 KiB
Python
369 lines
15 KiB
Python
|
|
import os
|
|||
|
|
import json
|
|||
|
|
import time
|
|||
|
|
from datetime import datetime
|
|||
|
|
from playwright.sync_api import sync_playwright
|
|||
|
|
from loguru import logger
|
|||
|
|
|
|||
|
|
class ProductHuntScraper:
    """Scrape basic details of a single ProductHunt product page with Playwright.

    Typical use is ``ProductHuntScraper().scrape_product()``, which launches a
    Chromium instance, opens ``product_url``, stores a debugging screenshot and
    HTML dump, extracts the product fields and writes them to a JSON file.

    Attributes are documented with comments in ``__init__``.
    """

    # Placeholder stored for a mandatory field when no selector produced a value.
    _NOT_FOUND = "未找到"

    # Seconds to sleep after navigation so client-side rendering can settle.
    _SETTLE_SECONDS = 10

    # Selectors probed (in order) to decide that the page has rendered content.
    _READY_SELECTORS = (
        "h1",
        "[data-test='product-name']",
        ".product-name",
        "div[class*='product']",
        "div[class*='styles_']",
    )

    # Candidate selectors for the product name, most specific guesses first.
    _NAME_SELECTORS = (
        "h1",
        "[data-test='product-name']",
        ".product-name",
        "[class*='product'][class*='name']",
        ".styles_productName__",
        "[class*='heading'][class*='xl']",
        "div[class*='text-2xl']",
        "div[class*='text-3xl']",
        "div[class*='text-4xl']",
        "div[class*='text-5xl']",
        "div[class*='text-6xl']",
        "div[class*='font-bold']",
        "div[class*='font-semibold']",
    )

    # Candidate selectors for the product description / tagline.
    _DESCRIPTION_SELECTORS = (
        "div.relative.text-16.font-normal.text-gray-700",
        ".text-16.font-normal.text-gray-700",
        "[class*='text-16'][class*='font-normal'][class*='text-gray-700']",
        "div[class*='description']",
        ".product-description",
        "div[class*='tagline']",
        "[data-test='product-tagline']",
        ".styles_tagline__",
        "p[class*='text-gray']",
        "div[class*='mb-4']",
        "div[class*='text-base']",
        "div[class*='text-lg']",
        "div[class*='text-gray-600']",
        "div[class*='text-gray-700']",
        "div[class*='text-gray-800']",
    )

    # Candidate selectors for the first comment shown on the page.
    _COMMENT_SELECTORS = (
        "div.flex.flex-1.flex-col.gap-2",
        ".flex.flex-1.flex-col.gap-2",
        "[class*='flex'][class*='flex-1'][class*='flex-col'][class*='gap-2']",
        "div[class*='comment']",
        ".comment-text",
        "div[class*='review']",
        "div[class*='feedback']",
        "blockquote",
        "div[class*='border']",
        "[data-test='comment']",
        "div[class*='text-sm']",
        "div[class*='text-xs']",
        "div[class*='mt-2']",
        "div[class*='mb-2']",
    )

    # Script injected into every page before its own scripts run. It hides
    # ``navigator.webdriver`` and patches ``permissions.query`` for the
    # 'notifications' permission. The saved original ``query`` is bound to the
    # Permissions object: calling the detached native method without its
    # receiver fails with "Illegal invocation" for every other permission name.
    _STEALTH_INIT_SCRIPT = """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });

        // 覆盖permissions API
        const originalQuery = window.navigator.permissions.query.bind(window.navigator.permissions);
        window.navigator.permissions.query = (parameters) => (
            parameters.name === 'notifications' ?
                Promise.resolve({ state: Notification.permission }) :
                originalQuery(parameters)
        );
    """

    def __init__(self):
        # Playwright driver handle; initialised here so close() is always safe,
        # even when connect_to_browser() was never called or failed early.
        self.playwright = None
        # Browser, browser context and page created by connect_to_browser().
        self.browser = None
        self.context = None
        self.page = None
        # Page to scrape; callers may overwrite it before scrape_product().
        self.product_url = "https://www.producthunt.com/products/elsie-ai-beta"

    @staticmethod
    def _clean_text(text):
        """Return *text* stripped of surrounding whitespace, or "" for None."""
        return text.strip() if text else ""

    def connect_to_browser(self) -> bool:
        """Start Playwright, launch Chromium and open a blank page.

        Returns:
            True when the browser, context and page are ready. False when any
            step failed; in that case everything that was already started is
            shut down again so no browser process is leaked.
        """
        try:
            logger.info("正在初始化Playwright浏览器...")

            # Start the Playwright driver.
            self.playwright = sync_playwright().start()

            # Launch Chromium. headless=False keeps the window visible so the
            # browser's behaviour can be observed.
            # NOTE(review): "--disable-web-security" (here) and
            # ignore_https_errors=True (below) relax browser security checks;
            # confirm both are really required for this target site.
            self.browser = self.playwright.chromium.launch(
                headless=False,
                args=[
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-gpu",
                    "--window-size=1920,1080",
                    "--disable-blink-features=AutomationControlled",
                    "--disable-web-security",
                    "--disable-features=VizDisplayCompositor",
                ],
            )

            # Create the browser context with a desktop Chrome fingerprint.
            self.context = self.browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                viewport={"width": 1920, "height": 1080},
                ignore_https_errors=True,
                extra_http_headers={
                    "Accept-Language": "en-US,en;q=0.9",
                    "Accept-Encoding": "gzip, deflate, br",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                    "Cache-Control": "no-cache",
                    "Pragma": "no-cache",
                },
            )

            # Open a new page and install the anti-automation-detection script.
            self.page = self.context.new_page()
            self.page.add_init_script(self._STEALTH_INIT_SCRIPT)

            logger.info("成功连接到Playwright浏览器")
            return True

        except Exception as e:
            logger.error(f"连接浏览器失败: {e}")
            # Do not leave a half-started driver/browser behind.
            self._release_resources()
            return False

    def navigate_to_product(self) -> bool:
        """Open ``product_url`` and wait for the page to render.

        Waiting for specific elements is best-effort: failures there are only
        logged as warnings and do not fail the navigation.

        Returns:
            True when the page was loaded, False when navigation raised.
        """
        try:
            logger.info(f"正在导航到产品页面: {self.product_url}")

            # Go straight to the product page, skipping the home page.
            logger.info("直接访问产品页面")
            self.page.goto(self.product_url, wait_until="domcontentloaded", timeout=60000)

            # Give dynamically loaded content time to appear.
            logger.info("等待页面加载...")
            time.sleep(self._SETTLE_SECONDS)

            # Log the final URL and title (useful to spot redirects/challenges).
            current_url = self.page.url
            page_title = self.page.title()
            logger.info(f"当前页面URL: {current_url}")
            logger.info(f"页面标题: {page_title}")

            try:
                logger.info("等待页面内容加载...")
                self.page.wait_for_selector("body", timeout=30000)

                # Stop at the first content selector that shows up.
                for selector in self._READY_SELECTORS:
                    try:
                        self.page.wait_for_selector(selector, timeout=5000)
                    except Exception:
                        # Timeout or lookup error: try the next candidate.
                        # (Not a bare ``except`` so Ctrl-C still interrupts.)
                        continue
                    logger.info(f"找到元素: {selector}")
                    break

            except Exception as e:
                logger.warning(f"等待页面元素时出错: {e}")

            logger.info("页面加载完成")
            return True

        except Exception as e:
            logger.error(f"导航到产品页面失败: {e}")
            return False

    def _find_text(self, selectors, label, min_length, preview_chars=None, rejected=()):
        """Return the text of the first selector match that looks meaningful.

        Args:
            selectors: CSS selectors to try in order; only the first element
                matched by each selector is inspected.
            label: Field name interpolated into the log messages.
            min_length: The stripped text must be longer than this.
            preview_chars: When set, only this many characters (followed by
                "...") are written to the info log.
            rejected: Exact text values that must be ignored.

        Returns:
            The stripped text, or the "not found" placeholder when no selector
            produced an acceptable value.
        """
        for selector in selectors:
            try:
                element = self.page.query_selector(selector)
                if element is None:
                    continue
                # text_content() may return None; normalise it to "".
                text = self._clean_text(element.text_content())
                if len(text) > min_length and text not in rejected:
                    shown = text if preview_chars is None else f"{text[:preview_chars]}..."
                    logger.info(f"使用选择器 {selector} 找到{label}: {shown}")
                    return text
            except Exception as e:
                logger.debug(f"使用选择器 {selector} 提取{label}时出错: {e}")

        logger.warning(f"未找到{label}")
        return self._NOT_FOUND

    def extract_product_info(self):
        """Extract product fields from the currently loaded page.

        Returns:
            A dict that always contains ``url``, ``scraped_at``, ``name``,
            ``description`` and ``first_comment`` (mandatory fields fall back
            to a "not found" placeholder) and, when matching elements exist,
            ``tags``, ``likes``, ``comment_count`` and ``image_url``.
            None when extraction failed with an unexpected error.
        """
        try:
            logger.info("开始提取产品信息")

            product_info = {
                "url": self.page.url,
                "scraped_at": datetime.now().isoformat(),
            }

            # Mandatory fields: try several selectors each.
            product_info["name"] = self._find_text(
                self._NAME_SELECTORS,
                "产品名称",
                min_length=2,
                rejected=("www.producthunt.com",),
            )
            product_info["description"] = self._find_text(
                self._DESCRIPTION_SELECTORS,
                "产品简介",
                min_length=10,  # shorter strings are unlikely to be a description
                preview_chars=50,
            )
            product_info["first_comment"] = self._find_text(
                self._COMMENT_SELECTORS,
                "第一个评论",
                min_length=10,  # shorter strings are unlikely to be a comment
                preview_chars=50,
            )

            # Optional extras: each one is best-effort and isolated, so a
            # failure in one does not prevent the others.

            # Product tags.
            try:
                tag_elements = self.page.query_selector_all("[class*='tag'], [class*='category'], [class*='topic'], [class*='badge']")
                if tag_elements:
                    texts = (self._clean_text(tag.text_content()) for tag in tag_elements)
                    tags = [text for text in texts if text]
                    product_info["tags"] = tags[:5]  # keep at most 5 tags
                    logger.info(f"找到标签: {tags[:3]}")
            except Exception as e:
                logger.debug(f"提取标签时出错: {e}")

            # Upvote count (text of the first matching element).
            try:
                like_elements = self.page.query_selector_all("[class*='vote'], [class*='like'], [class*='upvote'], [data-test='vote-count'], [class*='count']")
                if like_elements:
                    product_info["likes"] = self._clean_text(like_elements[0].text_content())
                    logger.info(f"点赞数: {product_info['likes']}")
            except Exception as e:
                logger.debug(f"提取点赞数时出错: {e}")

            # Comment count (text of the first matching element).
            try:
                comment_count_elements = self.page.query_selector_all("[class*='comment-count'], [class*='comments'], [data-test='comment-count'], [class*='count']")
                if comment_count_elements:
                    product_info["comment_count"] = self._clean_text(comment_count_elements[0].text_content())
                    logger.info(f"评论数: {product_info['comment_count']}")
            except Exception as e:
                logger.debug(f"提取评论数时出错: {e}")

            # Product image URL.
            try:
                img_elements = self.page.query_selector_all("img[class*='product'], img[alt*='product'], img[alt*='logo'], img[src*='product']")
                if img_elements:
                    product_info["image_url"] = img_elements[0].get_attribute("src")
                    logger.info(f"产品图片URL: {product_info['image_url']}")
            except Exception as e:
                logger.debug(f"提取产品图片时出错: {e}")

            return product_info

        except Exception as e:
            logger.error(f"提取产品信息失败: {e}")
            return None

    def save_to_file(self, data, filename="product_info.json") -> bool:
        """Write *data* to *filename* as UTF-8 JSON.

        Returns:
            True on success, False when writing or serialising failed.
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"数据已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存数据失败: {e}")
            return False

    def save_screenshot(self, filename="product_screenshot.png") -> bool:
        """Save a full-page screenshot of the current page (debugging aid).

        Returns:
            True on success, False when the screenshot could not be taken.
        """
        try:
            self.page.screenshot(path=filename, full_page=True)
            logger.info(f"页面截图已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存页面截图失败: {e}")
            return False

    def save_html(self, filename="product_page.html") -> bool:
        """Save the current page's HTML to *filename* (debugging aid).

        Returns:
            True on success, False when the HTML could not be read or written.
        """
        try:
            html_content = self.page.content()
            with open(filename, "w", encoding="utf-8") as f:
                f.write(html_content)
            logger.info(f"页面HTML已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存页面HTML失败: {e}")
            return False

    def _release_resources(self) -> bool:
        """Shut down context, browser and driver; return True if none failed.

        Each resource is released independently so that an error while closing
        one does not leave the others running. All handles are reset to None,
        which makes repeated calls harmless.
        """
        all_ok = True
        for attr, method in (("context", "close"), ("browser", "close"), ("playwright", "stop")):
            resource = getattr(self, attr, None)
            if resource is None:
                continue
            try:
                getattr(resource, method)()
            except Exception as e:
                all_ok = False
                logger.error(f"关闭浏览器时出错: {e}")
            finally:
                setattr(self, attr, None)
        # The page belongs to the context and is gone once the context is.
        self.page = None
        return all_ok

    def close(self):
        """Close the browser and stop Playwright; safe to call more than once."""
        if self._release_resources():
            logger.info("浏览器已关闭")

    def scrape_product(self) -> bool:
        """Run the whole scrape: connect, navigate, dump debug files, extract, save.

        Returns:
            True when product information was extracted, False otherwise.
            The browser is always closed before returning.
        """
        # connect_to_browser() cleans up after itself on failure.
        if not self.connect_to_browser():
            logger.error("无法连接到浏览器")
            return False

        try:
            if not self.navigate_to_product():
                logger.error("无法导航到产品页面")
                return False

            # Keep a screenshot and the raw HTML for debugging.
            self.save_screenshot()
            self.save_html()

            product_info = self.extract_product_info()
            if product_info:
                self.save_to_file(product_info)
                return True
            else:
                logger.error("未能提取产品信息")
                return False
        finally:
            self.close()
|
|||
|
|
|
|||
|
|
def main():
    """Run one ProductHunt scrape and log whether it succeeded."""
    logger.info("开始ProductHunt产品信息抓取")
    scraper = ProductHuntScraper()

    # To scrape a different product, override ``product_url`` here, e.g.:
    # scraper.product_url = "https://www.producthunt.com/products/your-product"

    if scraper.scrape_product():
        logger.info("产品信息抓取完成")
        return

    logger.error("产品信息抓取失败")
|
|||
|
|
|
|||
|
|
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|