# product/new_data_playwright.py

import os
import json
import time
from datetime import datetime
from playwright.sync_api import sync_playwright
from loguru import logger

class ProductHuntScraper:
    """Scrape basic details of a single Product Hunt product page via Playwright.

    Typical use is ``ProductHuntScraper().scrape_product()``, which launches
    Chromium, loads the product page, saves a debug screenshot, extracts the
    fields and writes them to a JSON file. The individual steps are also
    available as methods; each one logs its own failures and reports the
    outcome through its return value instead of raising.
    """

    DEFAULT_PRODUCT_URL = "https://www.producthunt.com/products/elsie-ai-beta"
    HOME_URL = "https://www.producthunt.com/"

    # Pauses (milliseconds) after loading the home page and the product page.
    HOME_WAIT_MS = 3000
    CONTENT_WAIT_MS = 10000

    # Placeholder stored for a required field that could not be extracted.
    NOT_FOUND = "未找到"

    _DESCRIPTION_SELECTOR = "div.relative.text-16.font-normal.text-gray-700"
    _FIRST_COMMENT_SELECTOR = "div.flex.flex-1.flex-col.gap-2"

    # (result key, label used in log messages, CSS selector,
    #  hint logged when nothing matches, log preview length or None for full text)
    _REQUIRED_FIELDS = (
        ("name", "产品名称", "h1", "h1标签", None),
        ("description", "产品简介", _DESCRIPTION_SELECTOR, _DESCRIPTION_SELECTOR, 50),
        ("first_comment", "第一个评论", _FIRST_COMMENT_SELECTOR, _FIRST_COMMENT_SELECTOR, 50),
    )

    # (result key, label used in log messages, CSS selector); the text of the
    # first matching element is stored, and the key is omitted when none match.
    _OPTIONAL_COUNTERS = (
        ("likes", "点赞数", "[class*='vote'], [class*='like'], [class*='upvote']"),
        ("comment_count", "评论数", "[class*='comment-count'], [class*='comments']"),
    )

    def __init__(self, product_url=DEFAULT_PRODUCT_URL, headless=False):
        """Create an unconnected scraper.

        Args:
            product_url: Product page to scrape. Also assignable later through
                the ``product_url`` attribute.
            headless: Passed to the Chromium launch call. The default ``False``
                keeps the browser window visible so its behaviour can be observed.
        """
        # All handles start as None so close() is safe before connecting.
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None
        self.product_url = product_url
        self.headless = headless

    def connect_to_browser(self):
        """Start Playwright, launch Chromium and open a page.

        Returns:
            True when the page is ready, False otherwise. On failure anything
            that was already started is shut down again.
        """
        try:
            logger.info("正在初始化Playwright浏览器...")

            self.playwright = sync_playwright().start()

            self.browser = self.playwright.chromium.launch(
                headless=self.headless,
                args=[
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-gpu",
                    "--window-size=1920,1080"
                ]
            )

            self.context = self.browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                viewport={"width": 1920, "height": 1080}
            )

            self.page = self.context.new_page()

            logger.info("成功连接到Playwright浏览器")
            return True

        except Exception as e:
            logger.error(f"连接浏览器失败: {e}")
            # Release whatever was started before the failure so a half-open
            # Playwright driver or browser process is not left behind.
            self.close()
            return False

    def navigate_to_product(self):
        """Load the Product Hunt home page, then the product page.

        Returns:
            True when both navigations completed, False if any step raised.
        """
        try:
            logger.info(f"正在导航到产品页面: {self.product_url}")

            # NOTE(review): the home page is visited first, presumably to
            # establish cookies/session before the product page - confirm.
            logger.info("首先访问ProductHunt主页")
            self.page.goto(self.HOME_URL, wait_until="domcontentloaded")
            # Playwright's documentation recommends wait_for_timeout() over
            # time.sleep() when pausing with the sync API.
            self.page.wait_for_timeout(self.HOME_WAIT_MS)

            self.page.goto(self.product_url, wait_until="domcontentloaded")

            # Fixed pause to let dynamically rendered content appear.
            logger.info("等待页面加载...")
            self.page.wait_for_timeout(self.CONTENT_WAIT_MS)

            logger.info("页面加载完成")
            return True
        except Exception as e:
            logger.error(f"导航到产品页面失败: {e}")
            return False

    @staticmethod
    def _element_text(element):
        """Return the stripped text of *element*, or None if there is none.

        None is returned both when *element* itself is None and when its
        ``text_content()`` call returns None.
        """
        if element is None:
            return None
        text = element.text_content()
        return None if text is None else text.strip()

    def _add_required_fields(self, product_info):
        """Fill the name, description and first-comment keys of *product_info*.

        Each key is always set; ``NOT_FOUND`` is stored when the selector
        matches nothing or the lookup raises.
        """
        for key, label, selector, hint, preview in self._REQUIRED_FIELDS:
            try:
                text = self._element_text(self.page.query_selector(selector))
            except Exception as e:
                logger.warning(f"提取{label}时出错: {e}")
                text = None
            else:
                if text is None:
                    logger.warning(f"未找到{label} ({hint})")
                elif preview is None:
                    logger.info(f"{label}: {text}")
                else:
                    logger.info(f"{label}: {text[:preview]}...")
            product_info[key] = self.NOT_FOUND if text is None else text

    def _add_optional_fields(self, product_info):
        """Add tags, likes and comment count to *product_info* when available.

        These are best-effort: a key is simply left out when nothing matches,
        and lookup errors are only logged at debug level.
        """
        try:
            tag_elements = self.page.query_selector_all("[class*='tag'], [class*='category'], [class*='topic']")
            if tag_elements:
                texts = (self._element_text(element) for element in tag_elements)
                tags = [text for text in texts if text]
                product_info["tags"] = tags[:5]  # keep at most five tags
                logger.info(f"找到标签: {tags[:3]}")
        except Exception as e:
            logger.debug(f"提取标签时出错: {e}")

        for key, label, selector in self._OPTIONAL_COUNTERS:
            try:
                elements = self.page.query_selector_all(selector)
                text = self._element_text(elements[0]) if elements else None
                if text is not None:
                    product_info[key] = text
                    logger.info(f"{label}: {text}")
            except Exception as e:
                logger.debug(f"提取{label}时出错: {e}")

    def extract_product_info(self):
        """Collect product details from the currently loaded page.

        Returns:
            A dict with ``url``, ``scraped_at``, ``name``, ``description`` and
            ``first_comment`` (missing values are stored as ``NOT_FOUND``),
            plus ``tags``, ``likes`` and ``comment_count`` when they could be
            found. None is returned if an unexpected error escapes the
            per-field handling.
        """
        try:
            logger.info("开始提取产品信息")

            product_info = {
                "url": self.product_url,
                "scraped_at": datetime.now().isoformat()
            }
            self._add_required_fields(product_info)
            self._add_optional_fields(product_info)
            return product_info

        except Exception as e:
            logger.error(f"提取产品信息失败: {e}")
            return None

    def save_to_file(self, data, filename="product_info.json"):
        """Write *data* to *filename* as indented UTF-8 JSON.

        Returns:
            True on success, False if opening the file or serialising failed.
        """
        try:
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"数据已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存数据失败: {e}")
            return False

    def save_screenshot(self, filename="product_screenshot.png"):
        """Save a screenshot of the current page to *filename* for debugging.

        Returns:
            True on success, False if the screenshot call raised.
        """
        try:
            self.page.screenshot(path=filename)
            logger.info(f"页面截图已保存到 {filename}")
            return True
        except Exception as e:
            logger.error(f"保存页面截图失败: {e}")
            return False

    def close(self):
        """Shut down the context, the browser and Playwright, in that order.

        Every resource is attempted even if an earlier one fails, and the
        handles are reset to None afterwards so calling close() again (or
        before connecting) is harmless.
        """
        had_error = False
        for attr, method in (("context", "close"), ("browser", "close"), ("playwright", "stop")):
            resource = getattr(self, attr, None)
            if not resource:
                continue
            try:
                getattr(resource, method)()
            except Exception as e:
                had_error = True
                logger.error(f"关闭浏览器时出错: {e}")
            finally:
                setattr(self, attr, None)
        self.page = None
        if not had_error:
            logger.info("浏览器已关闭")

    def scrape_product(self):
        """Run the full flow: connect, navigate, screenshot, extract, save.

        Returns:
            True only when the product info was extracted and written to disk;
            False if any step failed. The browser is always closed at the end.
        """
        if not self.connect_to_browser():
            logger.error("无法连接到浏览器")
            return False

        try:
            if not self.navigate_to_product():
                logger.error("无法导航到产品页面")
                return False

            # The screenshot is a debugging aid only; its result is ignored.
            self.save_screenshot()

            product_info = self.extract_product_info()
            if not product_info:
                logger.error("未能提取产品信息")
                return False

            # Report failure when the JSON file could not be written.
            return self.save_to_file(product_info)
        finally:
            self.close()

def main():
    """Scrape the default Product Hunt product and log the overall outcome."""
    logger.info("开始ProductHunt产品信息抓取")
    scraper = ProductHuntScraper()

    # To scrape a different product, point the scraper at it before running:
    # scraper.product_url = "https://www.producthunt.com/products/your-product"

    if scraper.scrape_product():
        logger.info("产品信息抓取完成")
        return

    logger.error("产品信息抓取失败")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
增加对producthunt网站的数据爬取 2025-11-17 07:39:45 +08:00
			`import os`
			`import json`
			`import time`
			`from datetime import datetime`
			`from playwright.sync_api import sync_playwright`
			`from loguru import logger`

			`class ProductHuntScraper:`
			`def __init__(self):`
			`self.browser = None`
			`self.context = None`
			`self.page = None`
			`self.product_url = "https://www.producthunt.com/products/elsie-ai-beta"`

			`def connect_to_browser(self):`
			`"""连接到浏览器"""`
			`try:`
			`logger.info("正在初始化Playwright浏览器...")`

			`# 启动Playwright`
			`self.playwright = sync_playwright().start()`

			`# 启动Chromium浏览器`
			`self.browser = self.playwright.chromium.launch(`
			`headless=False, # 设置为False以便观察浏览器行为`
			`args=[`
			`"--no-sandbox",`
			`"--disable-dev-shm-usage",`
			`"--disable-gpu",`
			`"--window-size=1920,1080"`
			`]`
			`)`

			`# 创建浏览器上下文`
			`self.context = self.browser.new_context(`
			`user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",`
			`viewport={"width": 1920, "height": 1080}`
			`)`

			`# 创建新页面`
			`self.page = self.context.new_page()`

			`logger.info("成功连接到Playwright浏览器")`
			`return True`

			`except Exception as e:`
			`logger.error(f"连接浏览器失败: {str(e)}")`
			`return False`

			`def navigate_to_product(self):`
			`"""导航到产品页面"""`
			`try:`
			`logger.info(f"正在导航到产品页面: {self.product_url}")`

			`# 首先访问主页`
			`logger.info("首先访问ProductHunt主页")`
			`self.page.goto("https://www.producthunt.com/", wait_until="domcontentloaded")`
			`time.sleep(3) # 等待页面加载`

			`# 然后访问产品页面`
			`self.page.goto(self.product_url, wait_until="domcontentloaded")`

			`# 等待页面加载`
			`logger.info("等待页面加载...")`
			`time.sleep(10) # 等待动态内容加载`

			`logger.info("页面加载完成")`
			`return True`
			`except Exception as e:`
			`logger.error(f"导航到产品页面失败: {str(e)}")`
			`return False`

			`def extract_product_info(self):`
			`"""提取产品信息"""`
			`try:`
			`logger.info("开始提取产品信息")`

			`product_info = {`
			`"url": self.product_url,`
			`"scraped_at": datetime.now().isoformat()`
			`}`

			`# 提取产品名称 (h1标签)`
			`try:`
			`name_element = self.page.query_selector("h1")`
			`if name_element:`
			`product_info["name"] = name_element.text_content().strip()`
			`logger.info(f"产品名称: {product_info['name']}")`
			`else:`
			`logger.warning("未找到产品名称 (h1标签)")`
			`product_info["name"] = "未找到"`
			`except Exception as e:`
			`logger.warning(f"提取产品名称时出错: {str(e)}")`
			`product_info["name"] = "未找到"`

			`# 提取产品简介 (class为"relative text-16 font-normal text-gray-700"的div)`
			`try:`
			`desc_selector = "div.relative.text-16.font-normal.text-gray-700"`
			`desc_element = self.page.query_selector(desc_selector)`
			`if desc_element:`
			`product_info["description"] = desc_element.text_content().strip()`
			`logger.info(f"产品简介: {product_info['description'][:50]}...")`
			`else:`
			`logger.warning(f"未找到产品简介 ({desc_selector})")`
			`product_info["description"] = "未找到"`
			`except Exception as e:`
			`logger.warning(f"提取产品简介时出错: {str(e)}")`
			`product_info["description"] = "未找到"`

			`# 提取第一个评论 (class为"flex flex-1 flex-col gap-2"的div)`
			`try:`
			`comment_selector = "div.flex.flex-1.flex-col.gap-2"`
			`comment_element = self.page.query_selector(comment_selector)`
			`if comment_element:`
			`product_info["first_comment"] = comment_element.text_content().strip()`
			`logger.info(f"第一个评论: {product_info['first_comment'][:50]}...")`
			`else:`
			`logger.warning(f"未找到第一个评论 ({comment_selector})")`
			`product_info["first_comment"] = "未找到"`
			`except Exception as e:`
			`logger.warning(f"提取第一个评论时出错: {str(e)}")`
			`product_info["first_comment"] = "未找到"`

			`# 尝试提取其他有用信息`
			`try:`
			`# 尝试获取产品标签`
			`tag_elements = self.page.query_selector_all("[class*='tag'], [class*='category'], [class*='topic']")`
			`if tag_elements:`
			`tags = [tag.text_content().strip() for tag in tag_elements if tag.text_content().strip()]`
			`product_info["tags"] = tags[:5] # 最多取5个标签`
			`logger.info(f"找到标签: {tags[:3]}")`
			`except Exception as e:`
			`logger.debug(f"提取标签时出错: {str(e)}")`

			`# 尝试获取点赞数`
			`try:`
			`like_elements = self.page.query_selector_all("[class*='vote'], [class*='like'], [class*='upvote']")`
			`if like_elements:`
			`product_info["likes"] = like_elements[0].text_content().strip()`
			`logger.info(f"点赞数: {product_info['likes']}")`
			`except Exception as e:`
			`logger.debug(f"提取点赞数时出错: {str(e)}")`

			`# 尝试获取评论数`
			`try:`
			`comment_count_elements = self.page.query_selector_all("[class*='comment-count'], [class*='comments']")`
			`if comment_count_elements:`
			`product_info["comment_count"] = comment_count_elements[0].text_content().strip()`
			`logger.info(f"评论数: {product_info['comment_count']}")`
			`except Exception as e:`
			`logger.debug(f"提取评论数时出错: {str(e)}")`

			`return product_info`

			`except Exception as e:`
			`logger.error(f"提取产品信息失败: {str(e)}")`
			`return None`

			`def save_to_file(self, data, filename="product_info.json"):`
			`"""保存数据到文件"""`
			`try:`
			`with open(filename, "w", encoding="utf-8") as f:`
			`json.dump(data, f, ensure_ascii=False, indent=2)`
			`logger.info(f"数据已保存到 {filename}")`
			`return True`
			`except Exception as e:`
			`logger.error(f"保存数据失败: {str(e)}")`
			`return False`

			`def save_screenshot(self, filename="product_screenshot.png"):`
			`"""保存页面截图，用于调试"""`
			`try:`
			`self.page.screenshot(path=filename)`
			`logger.info(f"页面截图已保存到 {filename}")`
			`return True`
			`except Exception as e:`
			`logger.error(f"保存页面截图失败: {str(e)}")`
			`return False`

			`def close(self):`
			`"""关闭浏览器"""`
			`try:`
			`if self.context:`
			`self.context.close()`
			`if self.browser:`
			`self.browser.close()`
			`if self.playwright:`
			`self.playwright.stop()`
			`logger.info("浏览器已关闭")`
			`except Exception as e:`
			`logger.error(f"关闭浏览器时出错: {str(e)}")`

			`def scrape_product(self):`
			`"""执行完整的抓取流程"""`
			`if not self.connect_to_browser():`
			`logger.error("无法连接到浏览器")`
			`return False`

			`try:`
			`if not self.navigate_to_product():`
			`logger.error("无法导航到产品页面")`
			`return False`

			`# 保存截图用于调试`
			`self.save_screenshot()`

			`product_info = self.extract_product_info()`
			`if product_info:`
			`self.save_to_file(product_info)`
			`return True`
			`else:`
			`logger.error("未能提取产品信息")`
			`return False`
			`finally:`
			`self.close()`

			`def main():`
			`logger.info("开始ProductHunt产品信息抓取")`
			`scraper = ProductHuntScraper()`

			`# 可以修改product_url来抓取其他产品`
			`# scraper.product_url = "https://www.producthunt.com/products/your-product"`

			`success = scraper.scrape_product()`

			`if success:`
			`logger.info("产品信息抓取完成")`
			`else:`
			`logger.error("产品信息抓取失败")`

			`if __name__ == "__main__":`
			`main()`