修改了入库的逻辑,如果3天内已存在相同标题的文章,则跳过入库。
This commit is contained in:
@@ -84,20 +84,37 @@ def parse_file_content(file_path):
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
def check_duplicate(title, date_str):
|
def check_duplicate(title, date_str):
|
||||||
"""检查标题+日期是否已存在"""
|
"""检查标题在最近三天(前天、昨天和今天)是否已存在"""
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
conn = sqlite3.connect('tophub_data.db')
|
conn = sqlite3.connect('tophub_data.db')
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
try:
|
||||||
|
# 将输入日期字符串转换为datetime对象
|
||||||
|
current_date = datetime.strptime(date_str, '%Y-%m-%d')
|
||||||
|
|
||||||
|
# 计算前天、昨天和今天的日期
|
||||||
|
yesterday = current_date - timedelta(days=1)
|
||||||
|
day_before_yesterday = current_date - timedelta(days=2)
|
||||||
|
|
||||||
|
# 检查这三天内是否有相同标题的文章
|
||||||
cursor.execute('''
|
cursor.execute('''
|
||||||
SELECT COUNT(*) FROM articles
|
SELECT COUNT(*) FROM articles
|
||||||
WHERE title = ? AND source_date = ?
|
WHERE title = ? AND source_date IN (?, ?, ?)
|
||||||
''', (title, date_str))
|
''', (title,
|
||||||
|
day_before_yesterday.strftime('%Y-%m-%d'),
|
||||||
|
yesterday.strftime('%Y-%m-%d'),
|
||||||
|
date_str))
|
||||||
|
|
||||||
count = cursor.fetchone()[0]
|
count = cursor.fetchone()[0]
|
||||||
conn.close()
|
logger.info(f"检查标题 '{title}' 在最近三天的重复情况: 找到 {count} 条相同记录")
|
||||||
|
|
||||||
return count > 0
|
return count > 0
|
||||||
|
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
def classify_title(title):
|
def classify_title(title):
|
||||||
"""调用API对标题进行分类"""
|
"""调用API对标题进行分类"""
|
||||||
try:
|
try:
|
||||||
@@ -193,7 +210,7 @@ def process_temp_files():
|
|||||||
|
|
||||||
# 检查重复
|
# 检查重复
|
||||||
if check_duplicate(article['title'], source_date):
|
if check_duplicate(article['title'], source_date):
|
||||||
logger.info(f"跳过重复文章: {article['title']}")
|
logger.info(f"跳过重复文章(最近三天已存在): {article['title']}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 分类标题
|
# 分类标题
|
||||||
|
|||||||
Reference in New Issue
Block a user