更新今天的数据

2025-11-20 20:57:09 +08:00
parent 8258c3532d
commit 12b57f1c57
8 changed files with 6640 additions and 2332 deletions
--- a/2025年11月20日201319.txt
+++ b/2025年11月20日201319.txt
--- a/product/playwright-get-data.py
+++ b/product/playwright-get-data.py
@@ -22,6 +22,8 @@ class ProductHuntScraper:
        self.debug_port = debug_port
        self.browser = None
        self.page = None
        self.click_records = []  # 记录点击行为
        self.dom_selection_records = []  # 记录DOM选取行为
    async def connect_to_existing_chrome(self):
        """连接到已运行的Chrome实例"""
@@ -58,7 +60,47 @@ class ProductHuntScraper:
        except Exception as e:
            logger.error(f"连接Chrome失败: {e}")
            return False
-    
+
    async def record_click(self, x, y, selector="", description=""):
        """Append a click event to ``self.click_records`` and log it.

        Args:
            x: Horizontal coordinate of the click.
            y: Vertical coordinate of the click.
            selector: Selector string associated with the click, if any.
            description: Free-form label describing the click.
        """
        # Capture the timestamp once, at the moment the record is created.
        now_text = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        entry = dict(
            timestamp=now_text,
            type="click",
            x=x,
            y=y,
            selector=selector,
            description=description,
        )
        self.click_records.append(entry)
        logger.info(f"记录点击: {description} - 坐标({x}, {y}) - 选择器: {selector}")
    async def record_dom_selection(self, selector, description=""):
        """Append a DOM-selection event to ``self.dom_selection_records`` and log it.

        Args:
            selector: Selector string that is about to be queried.
            description: Free-form label describing what is being selected.
        """
        # Capture the timestamp once, at the moment the record is created.
        now_text = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        entry = dict(
            timestamp=now_text,
            type="dom_selection",
            selector=selector,
            description=description,
        )
        self.dom_selection_records.append(entry)
        logger.info(f"记录DOM选取: {description} - 选择器: {selector}")
    async def save_behavior_records(self):
        """Write the recorded clicks and DOM selections to a JSON file.

        The file is written relative to the current working directory and is
        named ``playwright_behavior_records_<YYYYmmdd_HHMMSS>.json``, where the
        timestamp is taken when this method runs.

        Returns:
            str: The name of the file that was written. Returning it lets
            callers report the real, timestamped name instead of guessing it.
        """
        import json

        records = {
            "click_records": self.click_records,
            "dom_selection_records": self.dom_selection_records,
        }
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"playwright_behavior_records_{stamp}.json"
        # ensure_ascii=False keeps non-ASCII descriptions readable in the file.
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(records, f, ensure_ascii=False, indent=2)
        logger.success(f"行为记录已保存到: {filename}")
        return filename
    async def navigate_to_producthunt(self, url):
        """导航到ProductHunt页面"""
        if not self.page:
@@ -115,6 +157,7 @@ class ProductHuntScraper:
            logger.info("模拟点击制作人链接...")
            # 查找包含制作人信息的div容器（class="flex flex-col gap-1"）
            await self.record_dom_selection('div.flex.flex-col.gap-1', "制作人信息容器")
            div_container = await self.page.query_selector('div.flex.flex-col.gap-1')
            if not div_container:
                logger.warning("未找到class='flex flex-col gap-1'的div容器，使用备用方法")
@@ -156,9 +199,17 @@ class ProductHuntScraper:
            logger.info(f"点击位置: ({center_x:.1f}, {center_y:.1f})")
            # 记录点击行为
            await self.record_click(center_x, center_y, 'div.flex.flex-col.gap-1', "制作人链接点击")
            # 先模拟点击，然后监听新窗口打开事件
            # 添加动态点击效果：先移动到位置，短暂停留，然后点击
            await self.page.mouse.move(center_x, center_y)
            await self.page.wait_for_timeout(2000)  # 短暂停留2000毫秒，模拟用户移动鼠标
            # 监听新窗口打开事件
            async with self.page.context.expect_page() as new_page_info:
-                # 模拟点击计算出的位置
+                # 执行点击操作
                await self.page.mouse.click(center_x, center_y)
            # 获取新页面
@@ -171,11 +222,13 @@ class ProductHuntScraper:
            logger.success("新窗口已加载完成")
            # 抓取第一个section的tag
            await self.record_dom_selection('section', "新窗口第一个section标签")
            first_section = await new_page.query_selector('section')
            if first_section:
                logger.success("找到第一个section标签")
                # 在section下面找一个没有任何class的div标签
                await self.record_dom_selection('div:not([class])', "section下无class的div标签")
                div_without_class = await first_section.query_selector('div:not([class])')
                if div_without_class:
                    logger.success("找到无class的div标签")
@@ -216,17 +269,20 @@ class ProductHuntScraper:
            new_page = await self.browser.new_page()
            # 导航到制作人页面
-            await new_page.goto(maker_link, wait_until="domcontentloaded", timeout=3000000)
+            await new_page.goto(maker_link, wait_until="domcontentloaded", timeout=30000)
            # 等待页面加载
-            await new_page.wait_for_timeout(3000)
+            await new_page.wait_for_timeout(15000)
            logger.info("页面加载等待完成，开始提取内容...")
            # 抓取第一个section的tag
            await self.record_dom_selection('section', "备用方法-新窗口第一个section标签")
            first_section = await new_page.query_selector('section')
            if first_section:
                logger.success("找到第一个section标签")
                # 在section下面找一个没有任何class的div标签
                await self.record_dom_selection('div:not([class])', "备用方法-section下无class的div标签")
                div_without_class = await first_section.query_selector('div:not([class])')
                if div_without_class:
                    logger.success("找到无class的div标签")
@@ -248,6 +304,10 @@ class ProductHuntScraper:
                result = maker_text
                logger.info(f"制作人发言(回退a标签): {maker_text[:200]}...")
            # 添加充分延迟，确保内容完全加载
            logger.info("等待内容完全稳定...")
            await new_page.wait_for_timeout(3000)
            # 关闭新页面
            await new_page.close()
            logger.info("新窗口已关闭")
@@ -269,6 +329,7 @@ class ProductHuntScraper:
            product_info = {}
            # 提取产品名称（h1标签）
            await self.record_dom_selection("h1", "产品名称")
            name_element = await self.page.query_selector("h1")
            if name_element:
                product_info["name"] = (await name_element.text_content()).strip()
@@ -277,6 +338,7 @@ class ProductHuntScraper:
            # 提取产品简介（class为"relative text-16 font-normal text-gray-700"的div）
            logger.info("正在提取产品简介...")
            try:
                await self.record_dom_selection('div.relative.text-16.font-normal.text-gray-700', "产品简介")
                intro_div = await self.page.query_selector('div.relative.text-16.font-normal.text-gray-700')
                if intro_div:
                    product_info["introduction"] = (await intro_div.text_content()).strip()
@@ -290,6 +352,7 @@ class ProductHuntScraper:
            logger.info("等待制作人发言动态加载...")
            try:
                # 等待section标签出现，最长等待60秒
                await self.record_dom_selection('section.flex.flex-col.gap-2', "制作人发言区域")
                section_element = await self.page.wait_for_selector(
                    'section.flex.flex-col.gap-2', 
                    timeout=60000
@@ -298,6 +361,7 @@ class ProductHuntScraper:
                    logger.success("制作人发言区域已加载")
                    # 查找section标签下面的第一个a标签
                    await self.record_dom_selection('a', "制作人链接")
                    a_element = await section_element.query_selector('a')
                    if a_element:
                        # 提取a标签的文本内容
@@ -412,6 +476,10 @@ async def main():
            with open("product_info.json", "w", encoding="utf-8") as f:
                json.dump(product_info, f, ensure_ascii=False, indent=2)
            logger.info("产品信息已保存到 product_info.json")
            # 保存点击和DOM选取行为记录
            await scraper.save_behavior_records()
            logger.info("行为记录已保存到 behavior_records.json")
        else:
            logger.warning("未能提取到产品信息")
--- a/product_info.json
+++ b/product_info.json
@@ -2,6 +2,5 @@
  "name": "Notion",
  "introduction": "Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows.",
  "maker_link": "https://www.producthunt.com/products/notion/launches/ai-meeting-notes-by-notion",
-  "maker_statement": "AI Meeting Notes by Notion",
+  "maker_statement": "AI Meeting Notes by Notion"
  "user_count": "15K followers"
 }
--- a/product_screenshot.png
+++ b/product_screenshot.png
--- a/temp_product_info.txt
+++ b/temp_product_info.txt
@@ -6,6 +6,6 @@
 制作人发言: AI Meeting Notes by Notion
-用户数: 15K followers
+用户数: 未获取
-提取时间: 2025-11-18 22:38:38
+提取时间: 2025-11-19 22:46:52
--- a/tophub_add_data_to_db.log
+++ b/tophub_add_data_to_db.log
--- a/tophub_data.db
+++ b/tophub_data.db
--- a/tophub_scraper.log
+++ b/tophub_scraper.log