def check_duplicate(title, date_str, *, db_path='tophub_data.db'):
    """Return whether ``title`` was already stored within the last three days.

    The window covers ``date_str`` itself plus the two preceding calendar
    days. A row in ``articles`` matches when its ``title`` is equal to
    ``title`` and its ``source_date`` is one of those three ``YYYY-MM-DD``
    strings.

    Args:
        title: Article title to look up (exact match).
        date_str: Source date of the article, expected as ``YYYY-MM-DD``.
            If it cannot be parsed, the check degrades to an exact match on
            ``date_str`` verbatim instead of raising, so one malformed date
            does not abort the whole import run.
        db_path: SQLite database file to query. Defaults to the project
            database in the current working directory.

    Returns:
        True if at least one matching row exists, otherwise False.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    from contextlib import closing
    from datetime import datetime, timedelta

    try:
        current_date = datetime.strptime(date_str, '%Y-%m-%d')
    except ValueError:
        # Without a parseable date the neighbouring days cannot be computed;
        # fall back to comparing against the raw date string only.
        logger.warning(
            f"日期 '{date_str}' 不是 YYYY-MM-DD 格式，仅按原始日期精确匹配检查重复"
        )
        candidate_dates = [date_str]
    else:
        candidate_dates = [
            (current_date - timedelta(days=2)).strftime('%Y-%m-%d'),
            (current_date - timedelta(days=1)).strftime('%Y-%m-%d'),
            # The current day is passed through unchanged so it matches rows
            # stored with exactly the string the caller uses.
            date_str,
        ]

    # Only "?" markers are interpolated into the SQL text; every value is
    # still bound as a parameter.
    placeholders = ', '.join('?' * len(candidate_dates))
    query = f'''
        SELECT COUNT(*) FROM articles
        WHERE title = ? AND source_date IN ({placeholders})
    '''

    # closing() guarantees the connection is released even if the query fails.
    with closing(sqlite3.connect(db_path)) as conn:
        count = conn.execute(query, (title, *candidate_dates)).fetchone()[0]

    logger.info(f"检查标题 '{title}' 在最近三天的重复情况: 找到 {count} 条相同记录")

    return count > 0