更新今天的数据
This commit is contained in:
@@ -22,6 +22,8 @@ class ProductHuntScraper:
|
||||
self.debug_port = debug_port
|
||||
self.browser = None
|
||||
self.page = None
|
||||
self.click_records = [] # 记录点击行为
|
||||
self.dom_selection_records = [] # 记录DOM选取行为
|
||||
|
||||
async def connect_to_existing_chrome(self):
|
||||
"""连接到已运行的Chrome实例"""
|
||||
@@ -58,7 +60,47 @@ class ProductHuntScraper:
|
||||
except Exception as e:
|
||||
logger.error(f"连接Chrome失败: {e}")
|
||||
return False
|
||||
|
||||
|
||||
async def record_click(self, x, y, selector="", description=""):
    """Store one click event in ``self.click_records`` and log it.

    Args:
        x: X coordinate of the click.
        y: Y coordinate of the click.
        selector: Selector string associated with the click target.
        description: Free-form label describing the click.
    """
    # Local wall-clock time, formatted to second precision.
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = {
        "timestamp": stamp,
        "type": "click",
        "x": x,
        "y": y,
        "selector": selector,
        "description": description,
    }
    self.click_records.append(entry)
    logger.info(f"记录点击: {description} - 坐标({x}, {y}) - 选择器: {selector}")
|
||||
|
||||
async def record_dom_selection(self, selector, description=""):
    """Store one DOM-selection event in ``self.dom_selection_records`` and log it.

    Args:
        selector: Selector string that is being queried.
        description: Free-form label describing what the selector targets.
    """
    # Local wall-clock time, formatted to second precision.
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = {
        "timestamp": stamp,
        "type": "dom_selection",
        "selector": selector,
        "description": description,
    }
    self.dom_selection_records.append(entry)
    logger.info(f"记录DOM选取: {description} - 选择器: {selector}")
|
||||
|
||||
async def save_behavior_records(self):
    """Write the collected click and DOM-selection records to a JSON file.

    The output file is named ``playwright_behavior_records_<timestamp>.json``
    and, being a relative path, is created in the current working directory.
    """
    import json

    payload = {
        "click_records": self.click_records,
        "dom_selection_records": self.dom_selection_records,
    }

    # A second-resolution timestamp keeps successive runs from sharing a name.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"playwright_behavior_records_{stamp}.json"

    # ensure_ascii=False keeps the non-ASCII descriptions readable in the file.
    with open(filename, "w", encoding="utf-8") as out_file:
        json.dump(payload, out_file, ensure_ascii=False, indent=2)

    logger.success(f"行为记录已保存到: {filename}")
|
||||
|
||||
async def navigate_to_producthunt(self, url):
|
||||
"""导航到ProductHunt页面"""
|
||||
if not self.page:
|
||||
@@ -115,6 +157,7 @@ class ProductHuntScraper:
|
||||
logger.info("模拟点击制作人链接...")
|
||||
|
||||
# 查找包含制作人信息的div容器(class="flex flex-col gap-1")
|
||||
await self.record_dom_selection('div.flex.flex-col.gap-1', "制作人信息容器")
|
||||
div_container = await self.page.query_selector('div.flex.flex-col.gap-1')
|
||||
if not div_container:
|
||||
logger.warning("未找到class='flex flex-col gap-1'的div容器,使用备用方法")
|
||||
@@ -156,9 +199,17 @@ class ProductHuntScraper:
|
||||
|
||||
logger.info(f"点击位置: ({center_x:.1f}, {center_y:.1f})")
|
||||
|
||||
# 记录点击行为
|
||||
await self.record_click(center_x, center_y, 'div.flex.flex-col.gap-1', "制作人链接点击")
|
||||
|
||||
# 先模拟点击,然后监听新窗口打开事件
|
||||
# 添加动态点击效果:先移动到位置,短暂停留,然后点击
|
||||
await self.page.mouse.move(center_x, center_y)
|
||||
await self.page.wait_for_timeout(2000) # 短暂停留2000毫秒,模拟用户移动鼠标
|
||||
|
||||
# 监听新窗口打开事件
|
||||
async with self.page.context.expect_page() as new_page_info:
|
||||
# 模拟点击计算出的位置
|
||||
# 执行点击操作
|
||||
await self.page.mouse.click(center_x, center_y)
|
||||
|
||||
# 获取新页面
|
||||
@@ -171,11 +222,13 @@ class ProductHuntScraper:
|
||||
logger.success("新窗口已加载完成")
|
||||
|
||||
# 抓取第一个section的tag
|
||||
await self.record_dom_selection('section', "新窗口第一个section标签")
|
||||
first_section = await new_page.query_selector('section')
|
||||
if first_section:
|
||||
logger.success("找到第一个section标签")
|
||||
|
||||
# 在section下面找一个没有任何class的div标签
|
||||
await self.record_dom_selection('div:not([class])', "section下无class的div标签")
|
||||
div_without_class = await first_section.query_selector('div:not([class])')
|
||||
if div_without_class:
|
||||
logger.success("找到无class的div标签")
|
||||
@@ -216,17 +269,20 @@ class ProductHuntScraper:
|
||||
new_page = await self.browser.new_page()
|
||||
|
||||
# 导航到制作人页面
|
||||
await new_page.goto(maker_link, wait_until="domcontentloaded", timeout=3000000)
|
||||
await new_page.goto(maker_link, wait_until="domcontentloaded", timeout=30000)
|
||||
|
||||
# 等待页面加载
|
||||
await new_page.wait_for_timeout(3000)
|
||||
await new_page.wait_for_timeout(15000)
|
||||
logger.info("页面加载等待完成,开始提取内容...")
|
||||
|
||||
# 抓取第一个section的tag
|
||||
await self.record_dom_selection('section', "备用方法-新窗口第一个section标签")
|
||||
first_section = await new_page.query_selector('section')
|
||||
if first_section:
|
||||
logger.success("找到第一个section标签")
|
||||
|
||||
# 在section下面找一个没有任何class的div标签
|
||||
await self.record_dom_selection('div:not([class])', "备用方法-section下无class的div标签")
|
||||
div_without_class = await first_section.query_selector('div:not([class])')
|
||||
if div_without_class:
|
||||
logger.success("找到无class的div标签")
|
||||
@@ -248,6 +304,10 @@ class ProductHuntScraper:
|
||||
result = maker_text
|
||||
logger.info(f"制作人发言(回退a标签): {maker_text[:200]}...")
|
||||
|
||||
# 添加充分延迟,确保内容完全加载
|
||||
logger.info("等待内容完全稳定...")
|
||||
await new_page.wait_for_timeout(3000)
|
||||
|
||||
# 关闭新页面
|
||||
await new_page.close()
|
||||
logger.info("新窗口已关闭")
|
||||
@@ -269,6 +329,7 @@ class ProductHuntScraper:
|
||||
product_info = {}
|
||||
|
||||
# 提取产品名称(h1标签)
|
||||
await self.record_dom_selection("h1", "产品名称")
|
||||
name_element = await self.page.query_selector("h1")
|
||||
if name_element:
|
||||
product_info["name"] = (await name_element.text_content()).strip()
|
||||
@@ -277,6 +338,7 @@ class ProductHuntScraper:
|
||||
# 提取产品简介(class为"relative text-16 font-normal text-gray-700"的div)
|
||||
logger.info("正在提取产品简介...")
|
||||
try:
|
||||
await self.record_dom_selection('div.relative.text-16.font-normal.text-gray-700', "产品简介")
|
||||
intro_div = await self.page.query_selector('div.relative.text-16.font-normal.text-gray-700')
|
||||
if intro_div:
|
||||
product_info["introduction"] = (await intro_div.text_content()).strip()
|
||||
@@ -290,6 +352,7 @@ class ProductHuntScraper:
|
||||
logger.info("等待制作人发言动态加载...")
|
||||
try:
|
||||
# 等待section标签出现,最长等待60秒
|
||||
await self.record_dom_selection('section.flex.flex-col.gap-2', "制作人发言区域")
|
||||
section_element = await self.page.wait_for_selector(
|
||||
'section.flex.flex-col.gap-2',
|
||||
timeout=60000
|
||||
@@ -298,6 +361,7 @@ class ProductHuntScraper:
|
||||
logger.success("制作人发言区域已加载")
|
||||
|
||||
# 查找section标签下面的第一个a标签
|
||||
await self.record_dom_selection('a', "制作人链接")
|
||||
a_element = await section_element.query_selector('a')
|
||||
if a_element:
|
||||
# 提取a标签的文本内容
|
||||
@@ -412,6 +476,10 @@ async def main():
|
||||
with open("product_info.json", "w", encoding="utf-8") as f:
|
||||
json.dump(product_info, f, ensure_ascii=False, indent=2)
|
||||
logger.info("产品信息已保存到 product_info.json")
|
||||
|
||||
# 保存点击和DOM选取行为记录
|
||||
await scraper.save_behavior_records()
|
||||
logger.info("行为记录已保存到 behavior_records.json")
|
||||
else:
|
||||
logger.warning("未能提取到产品信息")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user