# File: tophux_scrape/product/new_data_simple.py
# (repository viewer metadata: 172 lines, 6.8 KiB, Python)

import os
import json
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from loguru import logger
class ProductHuntScraper:
    """Scrape basic information from a single Product Hunt product page.

    The scraper starts a Chrome browser through Selenium, opens
    ``product_url``, reads the product name, description and first comment
    from the rendered page, and writes the result to a JSON file.

    Every step reports failure by logging and returning ``False``/``None``
    instead of raising, so callers only need to check return values.
    """

    # Product page scraped when no URL is passed to the constructor.
    DEFAULT_PRODUCT_URL = "https://www.producthunt.com/products/elsie-ai-beta"

    # Seconds to pause before reading the page, giving client-side rendering
    # time to finish after the <body> element has appeared.
    PAGE_SETTLE_SECONDS = 5

    # Placeholder stored for a field whose element is missing from the page.
    NOT_FOUND = "未找到"

    def __init__(self, product_url=None):
        """Create a scraper that is not yet connected to a browser.

        Args:
            product_url: Product page to scrape. Defaults to
                ``DEFAULT_PRODUCT_URL`` when omitted. The attribute may also
                be reassigned after construction.
        """
        self.driver = None
        if product_url is None:
            product_url = self.DEFAULT_PRODUCT_URL
        self.product_url = product_url

    def connect_to_chrome(self):
        """Start a Chrome WebDriver session and store it in ``self.driver``.

        The driver is first created with default binary discovery; if that
        fails, a second attempt points at the standard Windows install path
        of Chrome.

        Returns:
            True when a driver was created, False otherwise.
        """
        try:
            logger.info("正在初始化Chrome驱动...")
            # Configure Chrome options.
            chrome_options = Options()
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
            # First attempt: let Selenium locate Chrome on its own.
            try:
                self.driver = webdriver.Chrome(options=chrome_options)
                logger.info("成功连接到Chrome实例")
                return True
            except Exception as e:
                logger.error(f"使用ChromeDriver连接失败: {str(e)}")
                # Second attempt: use the system-wide Chrome binary.
                try:
                    chrome_options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"
                    self.driver = webdriver.Chrome(options=chrome_options)
                    logger.info("成功连接到系统Chrome实例")
                    return True
                except Exception as e2:
                    logger.error(f"使用系统Chrome连接失败: {str(e2)}")
                    return False
        except Exception as e:
            logger.error(f"连接Chrome实例失败: {str(e)}")
            return False

    def navigate_to_product(self):
        """Open ``product_url`` and wait up to 10 seconds for ``<body>``.

        Returns:
            True when the page loaded, False on timeout or any other error.
        """
        try:
            logger.info(f"正在导航到产品页面: {self.product_url}")
            self.driver.get(self.product_url)
            # Wait for the page to load.
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            logger.info("页面加载完成")
            return True
        except TimeoutException:
            logger.error("页面加载超时")
            return False
        except Exception as e:
            logger.error(f"导航到产品页面失败: {str(e)}")
            return False

    def _read_text(self, by, selector, missing_message):
        """Return the stripped text of the first element matching a locator.

        Args:
            by: Selenium locator strategy (a ``By`` constant).
            selector: Locator value for that strategy.
            missing_message: Warning to log when no element matches.

        Returns:
            The element text with surrounding whitespace removed, or None
            when no element matches. Other Selenium errors propagate.
        """
        try:
            element = self.driver.find_element(by, selector)
        except NoSuchElementException:
            logger.warning(missing_message)
            return None
        return element.text.strip()

    def extract_product_info(self):
        """Read name, description and first comment from the loaded page.

        Returns:
            A dict with keys ``url``, ``scraped_at``, ``name``,
            ``description`` and ``first_comment``; a field whose element is
            missing holds ``NOT_FOUND``. Returns None on an unexpected error.
        """
        try:
            logger.info("开始提取产品信息")
            # Give the page time to finish rendering.
            time.sleep(self.PAGE_SETTLE_SECONDS)
            product_info = {
                "url": self.product_url,
                "scraped_at": datetime.now().isoformat()
            }

            # Product name: the page's <h1>.
            name = self._read_text(By.TAG_NAME, "h1", "未找到产品名称 (h1标签)")
            if name is None:
                product_info["name"] = self.NOT_FOUND
            else:
                product_info["name"] = name
                logger.info(f"产品名称: {name}")

            # Product description: located by its utility-class combination.
            description = self._read_text(
                By.CSS_SELECTOR,
                "div.relative.text-16.font-normal.text-gray-700",
                "未找到产品简介 (div.relative.text-16.font-normal.text-gray-700)",
            )
            if description is None:
                product_info["description"] = self.NOT_FOUND
            else:
                product_info["description"] = description
                logger.info(f"产品简介: {description[:50]}...")

            # First comment: the first element matching the comment layout.
            first_comment = self._read_text(
                By.CSS_SELECTOR,
                "div.flex.flex-1.flex-col.gap-2",
                "未找到第一个评论 (div.flex.flex-1.flex-col.gap-2)",
            )
            if first_comment is None:
                product_info["first_comment"] = self.NOT_FOUND
            else:
                product_info["first_comment"] = first_comment
                logger.info(f"第一个评论: {first_comment[:50]}...")

            return product_info
        except Exception as e:
            logger.error(f"提取产品信息失败: {str(e)}")
            return None

    def save_to_file(self, data, filename="product_info.json"):
        """Write ``data`` to ``filename`` as indented UTF-8 JSON.

        Args:
            data: JSON-serializable object to store.
            filename: Destination path; an existing file is overwritten.

        Returns:
            True when the file was written, False on any error.
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                # ensure_ascii=False keeps non-ASCII text readable in the file.
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"数据已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存数据失败: {str(e)}")
            return False

    def close(self):
        """Quit the browser, if one is open, and forget the driver.

        Errors from ``quit()`` are logged rather than raised because this
        method runs from a ``finally`` clause, where an exception would mask
        the scrape result. The driver reference is always cleared, so a
        second call does nothing.
        """
        if self.driver is None:
            return
        try:
            self.driver.quit()
            logger.info("浏览器已关闭")
        except Exception as e:
            logger.error(f"关闭浏览器失败: {str(e)}")
        finally:
            self.driver = None

    def scrape_product(self):
        """Run the full workflow: connect, navigate, extract, save, close.

        Returns:
            True only when the product information was extracted and written
            to disk; False when any step failed.
        """
        if not self.connect_to_chrome():
            logger.error("无法连接到Chrome实例")
            return False
        try:
            if not self.navigate_to_product():
                logger.error("无法导航到产品页面")
                return False
            product_info = self.extract_product_info()
            if product_info is None:
                logger.error("未能提取产品信息")
                return False
            # Propagate the save result so a failed write is not reported
            # as a successful scrape.
            return self.save_to_file(product_info)
        finally:
            self.close()
def main():
    """Scrape the default Product Hunt product and log the outcome."""
    logger.info("开始ProductHunt产品信息抓取")
    hunt_scraper = ProductHuntScraper()
    # To scrape a different product, set the target URL before running:
    # hunt_scraper.product_url = "https://www.producthunt.com/products/your-product"
    if not hunt_scraper.scrape_product():
        logger.error("产品信息抓取失败")
        return
    logger.info("产品信息抓取完成")


# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()