今天数据更新
This commit is contained in:
@@ -151,115 +151,74 @@ class ProductHuntScraper:
|
||||
logger.error(f"访问页面失败: {e}")
|
||||
return False
|
||||
|
||||
async def extract_maker_statement_from_new_window(self, maker_link, maker_text):
|
||||
"""模拟点击链接在新窗口中提取制作人发言内容"""
|
||||
async def extract_maker_statement_from_current_window(self, maker_link, maker_text):
|
||||
"""在当前窗口中提取制作人发言"""
|
||||
if not maker_link:
|
||||
logger.warning("制作人链接为空")
|
||||
return ""
|
||||
|
||||
if not self.page:
|
||||
logger.error("当前页面未初始化")
|
||||
return ""
|
||||
|
||||
try:
|
||||
logger.info("模拟点击制作人链接...")
|
||||
# 记录点击制作人链接的行为
|
||||
await self.record_click("制作人链接", "点击制作人链接在当前窗口打开")
|
||||
|
||||
# 查找包含制作人信息的div容器(class="flex flex-col gap-1")
|
||||
await self.record_dom_selection('div.flex.flex-col.gap-1', "制作人信息容器")
|
||||
div_container = await self.page.query_selector('div.flex.flex-col.gap-1')
|
||||
if not div_container:
|
||||
logger.warning("未找到class='flex flex-col gap-1'的div容器,使用备用方法")
|
||||
# 备用方法:直接打开新窗口
|
||||
return await self._extract_maker_statement_direct_open(maker_link, maker_text)
|
||||
# 在当前页面导航到制作人链接
|
||||
logger.info(f"正在在当前窗口打开制作人链接: {maker_link}")
|
||||
await self.page.goto(maker_link, wait_until="domcontentloaded")
|
||||
|
||||
# 获取div容器的边界框,用于点击中间位置
|
||||
bbox = await div_container.bounding_box()
|
||||
if not bbox:
|
||||
logger.warning("无法获取div容器边界框,使用备用方法")
|
||||
return await self._extract_maker_statement_direct_open(maker_link, maker_text)
|
||||
# 等待页面加载
|
||||
await self.page.wait_for_load_state("networkidle")
|
||||
|
||||
# 计算div容器中前面几个元素的高度总和
|
||||
# 获取div容器内的所有子元素
|
||||
child_elements = await div_container.query_selector_all('*')
|
||||
|
||||
# 计算前面几个元素的高度总和
|
||||
total_height = 0
|
||||
element_count = 0
|
||||
max_elements = 3 # 考虑前面3个元素的高度
|
||||
|
||||
for child in child_elements[:max_elements]:
|
||||
child_bbox = await child.bounding_box()
|
||||
if child_bbox:
|
||||
total_height += child_bbox['height']
|
||||
element_count += 1
|
||||
logger.debug(f"元素{element_count}高度: {child_bbox['height']:.1f}px")
|
||||
|
||||
# 如果无法获取子元素高度,使用div容器高度的一半
|
||||
if total_height == 0:
|
||||
center_y = bbox['y'] + bbox['height'] / 2
|
||||
logger.info("使用div容器高度的一半作为点击位置")
|
||||
else:
|
||||
# 计算点击位置:div容器的y坐标 + 前面元素高度总和
|
||||
center_y = bbox['y'] + total_height
|
||||
logger.info(f"使用前面{element_count}个元素高度总和作为点击位置")
|
||||
|
||||
center_x = bbox['x'] + bbox['width'] / 2
|
||||
|
||||
logger.info(f"点击位置: ({center_x:.1f}, {center_y:.1f})")
|
||||
|
||||
# 记录点击行为
|
||||
await self.record_click(center_x, center_y, 'div.flex.flex-col.gap-1', "制作人链接点击")
|
||||
|
||||
# 先注册新窗口打开事件的监听,再在监听范围内执行模拟点击
|
||||
# 添加动态点击效果:先移动到位置,短暂停留,然后点击
|
||||
await self.page.mouse.move(center_x, center_y)
|
||||
await self.page.wait_for_timeout(2000) # 短暂停留2000毫秒,模拟用户移动鼠标
|
||||
|
||||
# 监听新窗口打开事件
|
||||
async with self.page.context.expect_page() as new_page_info:
|
||||
# 执行点击操作
|
||||
await self.page.mouse.click(center_x, center_y)
|
||||
|
||||
# 获取新页面
|
||||
new_page = await new_page_info.value
|
||||
|
||||
# 等待新页面加载完成
|
||||
await new_page.wait_for_load_state("domcontentloaded")
|
||||
await new_page.wait_for_timeout(5000) # 额外等待5秒确保内容加载
|
||||
|
||||
logger.success("新窗口已加载完成")
|
||||
|
||||
# 抓取第一个section的tag
|
||||
await self.record_dom_selection('section', "新窗口第一个section标签")
|
||||
first_section = await new_page.query_selector('section')
|
||||
if first_section:
|
||||
logger.success("找到第一个section标签")
|
||||
# 等待title元素出现并包含产品名称(最长等待2分钟)
|
||||
logger.info("等待title元素出现并包含产品名称(最长等待2分钟)...")
|
||||
try:
|
||||
# 等待title元素出现,最长等待2分钟
|
||||
await self.page.wait_for_selector("title", timeout=120000)
|
||||
|
||||
# 在section下面找一个没有任何class的div标签
|
||||
await self.record_dom_selection('div:not([class])', "section下无class的div标签")
|
||||
div_without_class = await first_section.query_selector('div:not([class])')
|
||||
if div_without_class:
|
||||
logger.success("找到无class的div标签")
|
||||
|
||||
# 提取div及其子标签的所有文本内容
|
||||
maker_statement = await div_without_class.inner_text()
|
||||
result = maker_statement.strip()
|
||||
|
||||
logger.info(f"制作人发言(新窗口): {result[:2000]}...")
|
||||
# 检查title是否包含产品名称
|
||||
title_text = await self.page.title()
|
||||
logger.info(f"页面标题: {title_text}")
|
||||
|
||||
# 获取产品名称(从maker_text参数中获取)
|
||||
product_name = maker_text.strip() if maker_text else ""
|
||||
|
||||
if product_name and product_name.lower() in title_text.lower():
|
||||
logger.success(f"标题包含产品名称: {product_name}")
|
||||
else:
|
||||
logger.warning("未找到无class的div标签")
|
||||
# 回退到提取section的文本内容
|
||||
section_text = await first_section.inner_text()
|
||||
result = section_text.strip()
|
||||
logger.info(f"制作人发言(回退section): {result[:200]}...")
|
||||
else:
|
||||
logger.warning("未找到section标签")
|
||||
# 回退到原始a标签文本
|
||||
result = maker_text
|
||||
logger.info(f"制作人发言(回退a标签): {maker_text[:200]}...")
|
||||
logger.warning(f"标题不包含产品名称,产品名称: {product_name}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"等待title元素失败: {e}")
|
||||
|
||||
# 关闭新页面
|
||||
await new_page.close()
|
||||
logger.info("新窗口已关闭")
|
||||
# 再等待30秒,确保页面完全加载
|
||||
logger.info("再等待30秒,确保页面完全加载...")
|
||||
await self.page.wait_for_timeout(30000) # 等待30秒
|
||||
|
||||
return result
|
||||
# 提取制作人评论内容(XPath: //*[@id=\"comment-4597755\"]/div/div[2]/div/div/div)
|
||||
logger.info("正在提取制作人评论内容...")
|
||||
try:
|
||||
# 使用XPath查找评论元素
|
||||
comment_element = await self.page.query_selector(
|
||||
'xpath=//*[@id="comment-4597755"]/div/div[2]/div/div/div'
|
||||
)
|
||||
if comment_element:
|
||||
maker_statement = (await comment_element.text_content()).strip()
|
||||
logger.info(f"制作人评论内容: {maker_statement[:200]}...")
|
||||
|
||||
return maker_statement
|
||||
else:
|
||||
logger.warning("未找到XPath为//*[@id=\"comment-4597755\"]/div/div[2]/div/div/div的元素")
|
||||
except Exception as e:
|
||||
logger.error(f"提取制作人评论内容失败: {e}")
|
||||
|
||||
except Exception as new_page_error:
|
||||
logger.error(f"模拟点击操作失败: {new_page_error}")
|
||||
# 如果模拟点击失败,使用备用方法
|
||||
return await self._extract_maker_statement_direct_open(maker_link, maker_text)
|
||||
return ""
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"在当前窗口打开制作人链接失败: {e}")
|
||||
return ""
|
||||
|
||||
async def _extract_maker_statement_direct_open(self, maker_link, maker_text):
|
||||
"""备用方法:直接在新窗口中打开链接"""
|
||||
@@ -328,41 +287,63 @@ class ProductHuntScraper:
|
||||
try:
|
||||
product_info = {}
|
||||
|
||||
# 提取产品名称(h1标签)
|
||||
await self.record_dom_selection("h1", "产品名称")
|
||||
name_element = await self.page.query_selector("h1")
|
||||
if name_element:
|
||||
product_info["name"] = (await name_element.text_content()).strip()
|
||||
logger.info(f"产品名称: {product_info['name']}")
|
||||
# 提取产品名称(XPath: //h1)
|
||||
logger.info("正在提取产品名称...")
|
||||
try:
|
||||
await self.record_dom_selection("//h1", "产品名称")
|
||||
name_element = await self.page.query_selector("xpath=//h1")
|
||||
if name_element:
|
||||
product_info["name"] = (await name_element.text_content()).strip()
|
||||
logger.info(f"产品名称: {product_info['name']}")
|
||||
else:
|
||||
logger.warning("未找到XPath为//h1的元素")
|
||||
except Exception as e:
|
||||
logger.error(f"提取产品名称失败: {e}")
|
||||
|
||||
# 提取产品简介(class为"relative text-16 font-normal text-gray-700"的div)
|
||||
# 提取产品简介(XPath: //*[@class=\"relative text-16 font-normal text-gray-700\"]//div)
|
||||
logger.info("正在提取产品简介...")
|
||||
try:
|
||||
await self.record_dom_selection('div.relative.text-16.font-normal.text-gray-700', "产品简介")
|
||||
intro_div = await self.page.query_selector('div.relative.text-16.font-normal.text-gray-700')
|
||||
if intro_div:
|
||||
product_info["introduction"] = (await intro_div.text_content()).strip()
|
||||
await self.record_dom_selection('//*[@class="relative text-16 font-normal text-gray-700"]//div', "产品简介")
|
||||
intro_element = await self.page.query_selector('xpath=//*[@class="relative text-16 font-normal text-gray-700"]//div')
|
||||
if intro_element:
|
||||
product_info["introduction"] = (await intro_element.text_content()).strip()
|
||||
logger.info(f"产品简介: {product_info['introduction'][:200]}...")
|
||||
else:
|
||||
logger.warning("未找到class为'relative text-16 font-normal text-gray-700'的div")
|
||||
logger.warning("未找到XPath为//*[@class=\"relative text-16 font-normal text-gray-700\"]//div的元素")
|
||||
except Exception as e:
|
||||
logger.error(f"提取产品简介失败: {e}")
|
||||
|
||||
# 等待制作人发言动态加载(等待class="flex flex-col gap-2"的section标签出现)
|
||||
logger.info("等待制作人发言动态加载...")
|
||||
# 提取用户数(XPath: //*[@class=\"flex flex-row gap-2\"]//div/div[2]/span/p)
|
||||
logger.info("正在提取用户数...")
|
||||
try:
|
||||
# 等待section标签出现,最长等待60秒
|
||||
await self.record_dom_selection('section.flex.flex-col.gap-2', "制作人发言区域")
|
||||
section_element = await self.page.wait_for_selector(
|
||||
'section.flex.flex-col.gap-2',
|
||||
timeout=60000
|
||||
)
|
||||
if section_element:
|
||||
logger.success("制作人发言区域已加载")
|
||||
await self.record_dom_selection('//*[@class="flex flex-row gap-2"]//div/div[2]/span/p', "用户数")
|
||||
user_count_element = await self.page.query_selector('xpath=//*[@class="flex flex-row gap-2"]//div/div[2]/span/p')
|
||||
if user_count_element:
|
||||
product_info["user_count"] = (await user_count_element.text_content()).strip()
|
||||
logger.info(f"用户数: {product_info['user_count']}")
|
||||
else:
|
||||
logger.warning("未找到XPath为//*[@class=\"flex flex-row gap-2\"]//div/div[2]/span/p的元素")
|
||||
except Exception as e:
|
||||
logger.error(f"提取用户数失败: {e}")
|
||||
|
||||
# 提取制作人发言链接(XPath: //span[contains(@class, \"absolute\")]的父级a标签)
|
||||
logger.info("正在提取制作人发言链接...")
|
||||
try:
|
||||
# 增加显性等待,等待页面元素加载完成
|
||||
logger.info("等待页面元素加载...")
|
||||
await self.page.wait_for_timeout(20000) # 等待20秒
|
||||
|
||||
# 先找到包含class="absolute"的span元素
|
||||
await self.record_dom_selection('//span[contains(@class, "absolute")]', "制作人span标签")
|
||||
span_element = await self.page.query_selector('xpath=//span[contains(@class, "absolute")]')
|
||||
if span_element:
|
||||
# 找到span元素的父级a标签
|
||||
await self.record_dom_selection('//span[contains(@class, "absolute")]/parent::a', "制作人链接")
|
||||
|
||||
# 查找section标签下面的第一个a标签
|
||||
await self.record_dom_selection('a', "制作人链接")
|
||||
a_element = await section_element.query_selector('a')
|
||||
# 使用更可靠的方法获取父级a标签
|
||||
a_element = await span_element.evaluate_handle('(element) => element.closest("a")')
|
||||
|
||||
# 检查a_element是否为有效的元素句柄
|
||||
if a_element:
|
||||
# 提取a标签的文本内容
|
||||
maker_text = (await a_element.text_content()).strip()
|
||||
@@ -381,35 +362,14 @@ class ProductHuntScraper:
|
||||
product_info["maker_link"] = maker_link
|
||||
logger.info(f"制作人链接: {maker_link}")
|
||||
|
||||
# 调用子函数在新窗口中提取制作人发言
|
||||
product_info["maker_statement"] = await self.extract_maker_statement_from_new_window(maker_link, maker_text)
|
||||
|
||||
# 调用子函数在当前窗口中提取制作人发言
|
||||
product_info["maker_statement"] = await self.extract_maker_statement_from_current_window(maker_link, maker_text)
|
||||
else:
|
||||
logger.warning("在section中未找到a标签")
|
||||
# 如果没有a标签,尝试查找span标签
|
||||
span_element = await section_element.query_selector('span')
|
||||
if span_element:
|
||||
product_info["maker_statement"] = (await span_element.text_content()).strip()
|
||||
logger.info(f"制作人发言(回退span): {product_info['maker_statement'][:200]}...")
|
||||
else:
|
||||
logger.warning("未找到span标签")
|
||||
|
||||
logger.warning("未找到制作人链接的a标签")
|
||||
else:
|
||||
logger.warning("制作人发言区域未加载")
|
||||
logger.warning("未找到XPath为//span[contains(@class, \"absolute\")]的元素")
|
||||
except Exception as e:
|
||||
logger.error(f"等待制作人发言加载失败: {e}")
|
||||
|
||||
# 提取用户数(class="text-14 font-medium text-gray-700"的p标签)
|
||||
logger.info("正在提取用户数...")
|
||||
try:
|
||||
user_count_element = await self.page.query_selector('p.text-14.font-medium.text-gray-700')
|
||||
if user_count_element:
|
||||
product_info["user_count"] = (await user_count_element.text_content()).strip()
|
||||
logger.info(f"用户数: {product_info['user_count']}")
|
||||
else:
|
||||
logger.warning("未找到class为'text-14 font-medium text-gray-700'的p标签")
|
||||
except Exception as e:
|
||||
logger.error(f"提取用户数失败: {e}")
|
||||
logger.error(f"提取制作人发言链接失败: {e}")
|
||||
|
||||
# 保存到临时文件
|
||||
temp_file_path = "temp_product_info.txt"
|
||||
|
||||
Reference in New Issue
Block a user