更新今天的数据

This commit is contained in:
2025-11-20 20:57:09 +08:00
parent 8258c3532d
commit 12b57f1c57
8 changed files with 6640 additions and 2332 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -22,6 +22,8 @@ class ProductHuntScraper:
self.debug_port = debug_port
self.browser = None
self.page = None
self.click_records = [] # 记录点击行为
self.dom_selection_records = [] # 记录DOM选取行为
async def connect_to_existing_chrome(self):
"""连接到已运行的Chrome实例"""
@@ -58,7 +60,47 @@ class ProductHuntScraper:
except Exception as e:
logger.error(f"连接Chrome失败: {e}")
return False
async def record_click(self, x, y, selector="", description=""):
    """Append a click event to ``self.click_records`` and log it.

    Args:
        x: Horizontal coordinate of the click.
        y: Vertical coordinate of the click.
        selector: Selector string associated with the click (default empty).
        description: Free-text label for the click (default empty).
    """
    # Timestamp is formatted to second resolution at record-creation time.
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Keys are inserted in a fixed order so the serialized layout stays stable.
    entry = {"timestamp": stamp, "type": "click"}
    entry.update(x=x, y=y, selector=selector, description=description)
    self.click_records.append(entry)
    logger.info(f"记录点击: {description} - 坐标({x}, {y}) - 选择器: {selector}")
async def record_dom_selection(self, selector, description=""):
    """Append a DOM-selection event to ``self.dom_selection_records`` and log it.

    Args:
        selector: Selector string that is about to be queried.
        description: Free-text label for the selection (default empty).
    """
    # Timestamp is formatted to second resolution at record-creation time.
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = {"timestamp": stamp, "type": "dom_selection"}
    entry.update(selector=selector, description=description)
    self.dom_selection_records.append(entry)
    logger.info(f"记录DOM选取: {description} - 选择器: {selector}")
async def save_behavior_records(self):
    """Dump the recorded clicks and DOM selections to a timestamped JSON file.

    The output name is ``playwright_behavior_records_<YYYYmmdd_HHMMSS>.json``;
    because it is a relative path, it resolves against the current working
    directory. The file contains two top-level keys, ``click_records`` and
    ``dom_selection_records``.
    """
    import json

    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"playwright_behavior_records_{stamp}.json"
    payload = {
        "click_records": self.click_records,
        "dom_selection_records": self.dom_selection_records,
    }
    # ensure_ascii=False keeps non-ASCII descriptions readable in the output.
    with open(filename, "w", encoding="utf-8") as out:
        json.dump(payload, out, ensure_ascii=False, indent=2)
    logger.success(f"行为记录已保存到: {filename}")
async def navigate_to_producthunt(self, url):
"""导航到ProductHunt页面"""
if not self.page:
@@ -115,6 +157,7 @@ class ProductHuntScraper:
logger.info("模拟点击制作人链接...")
# 查找包含制作人信息的div容器class="flex flex-col gap-1"
await self.record_dom_selection('div.flex.flex-col.gap-1', "制作人信息容器")
div_container = await self.page.query_selector('div.flex.flex-col.gap-1')
if not div_container:
logger.warning("未找到class='flex flex-col gap-1'的div容器使用备用方法")
@@ -156,9 +199,17 @@ class ProductHuntScraper:
logger.info(f"点击位置: ({center_x:.1f}, {center_y:.1f})")
# 记录点击行为
await self.record_click(center_x, center_y, 'div.flex.flex-col.gap-1', "制作人链接点击")
# 先模拟点击,然后监听新窗口打开事件
# 添加动态点击效果:先移动到位置,短暂停留,然后点击
await self.page.mouse.move(center_x, center_y)
await self.page.wait_for_timeout(2000) # 短暂停留2000毫秒模拟用户移动鼠标
# 监听新窗口打开事件
async with self.page.context.expect_page() as new_page_info:
# 模拟点击计算出的位置
# 执行点击操作
await self.page.mouse.click(center_x, center_y)
# 获取新页面
@@ -171,11 +222,13 @@ class ProductHuntScraper:
logger.success("新窗口已加载完成")
# 抓取第一个section的tag
await self.record_dom_selection('section', "新窗口第一个section标签")
first_section = await new_page.query_selector('section')
if first_section:
logger.success("找到第一个section标签")
# 在section下面找一个没有任何class的div标签
await self.record_dom_selection('div:not([class])', "section下无class的div标签")
div_without_class = await first_section.query_selector('div:not([class])')
if div_without_class:
logger.success("找到无class的div标签")
@@ -216,17 +269,20 @@ class ProductHuntScraper:
new_page = await self.browser.new_page()
# 导航到制作人页面
await new_page.goto(maker_link, wait_until="domcontentloaded", timeout=3000000)
await new_page.goto(maker_link, wait_until="domcontentloaded", timeout=30000)
# 等待页面加载
await new_page.wait_for_timeout(3000)
await new_page.wait_for_timeout(15000)
logger.info("页面加载等待完成,开始提取内容...")
# 抓取第一个section的tag
await self.record_dom_selection('section', "备用方法-新窗口第一个section标签")
first_section = await new_page.query_selector('section')
if first_section:
logger.success("找到第一个section标签")
# 在section下面找一个没有任何class的div标签
await self.record_dom_selection('div:not([class])', "备用方法-section下无class的div标签")
div_without_class = await first_section.query_selector('div:not([class])')
if div_without_class:
logger.success("找到无class的div标签")
@@ -248,6 +304,10 @@ class ProductHuntScraper:
result = maker_text
logger.info(f"制作人发言(回退a标签): {maker_text[:200]}...")
# 添加充分延迟,确保内容完全加载
logger.info("等待内容完全稳定...")
await new_page.wait_for_timeout(3000)
# 关闭新页面
await new_page.close()
logger.info("新窗口已关闭")
@@ -269,6 +329,7 @@ class ProductHuntScraper:
product_info = {}
# 提取产品名称h1标签
await self.record_dom_selection("h1", "产品名称")
name_element = await self.page.query_selector("h1")
if name_element:
product_info["name"] = (await name_element.text_content()).strip()
@@ -277,6 +338,7 @@ class ProductHuntScraper:
# 提取产品简介class为"relative text-16 font-normal text-gray-700"的div
logger.info("正在提取产品简介...")
try:
await self.record_dom_selection('div.relative.text-16.font-normal.text-gray-700', "产品简介")
intro_div = await self.page.query_selector('div.relative.text-16.font-normal.text-gray-700')
if intro_div:
product_info["introduction"] = (await intro_div.text_content()).strip()
@@ -290,6 +352,7 @@ class ProductHuntScraper:
logger.info("等待制作人发言动态加载...")
try:
# 等待section标签出现最长等待60秒
await self.record_dom_selection('section.flex.flex-col.gap-2', "制作人发言区域")
section_element = await self.page.wait_for_selector(
'section.flex.flex-col.gap-2',
timeout=60000
@@ -298,6 +361,7 @@ class ProductHuntScraper:
logger.success("制作人发言区域已加载")
# 查找section标签下面的第一个a标签
await self.record_dom_selection('a', "制作人链接")
a_element = await section_element.query_selector('a')
if a_element:
# 提取a标签的文本内容
@@ -412,6 +476,10 @@ async def main():
with open("product_info.json", "w", encoding="utf-8") as f:
json.dump(product_info, f, ensure_ascii=False, indent=2)
logger.info("产品信息已保存到 product_info.json")
# 保存点击和DOM选取行为记录
await scraper.save_behavior_records()
logger.info("行为记录已保存到 behavior_records.json")
else:
logger.warning("未能提取到产品信息")

View File

@@ -2,6 +2,5 @@
"name": "Notion",
"introduction": "Notion is an all-in-one workspace that combines note-taking, project management, and task organization. It allows users to create customized databases, documents, and calendars to streamline their personal and professional workflows.",
"maker_link": "https://www.producthunt.com/products/notion/launches/ai-meeting-notes-by-notion",
"maker_statement": "AI Meeting Notes by Notion",
"user_count": "15K followers"
"maker_statement": "AI Meeting Notes by Notion"
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 631 KiB

After

Width:  |  Height:  |  Size: 804 KiB

View File

@@ -6,6 +6,6 @@
制作人发言: AI Meeting Notes by Notion
用户数: 15K followers
用户数: 未获取
提取时间: 2025-11-18 22:38:38
提取时间: 2025-11-19 22:46:52

File diff suppressed because it is too large Load Diff

Binary file not shown.

File diff suppressed because it is too large Load Diff