diff --git a/tophub_add_data_to_db.py b/tophub_add_data_to_db.py index f0f6b99..d0f5233 100644 --- a/tophub_add_data_to_db.py +++ b/tophub_add_data_to_db.py @@ -84,19 +84,36 @@ def parse_file_content(file_path): return [] def check_duplicate(title, date_str): - """检查标题+日期是否已存在""" + """检查标题在最近三天(前天、昨天和今天)是否已存在""" + from datetime import datetime, timedelta + conn = sqlite3.connect('tophub_data.db') cursor = conn.cursor() - cursor.execute(''' - SELECT COUNT(*) FROM articles - WHERE title = ? AND source_date = ? - ''', (title, date_str)) - - count = cursor.fetchone()[0] - conn.close() - - return count > 0 + try: + # 将输入日期字符串转换为datetime对象 + current_date = datetime.strptime(date_str, '%Y-%m-%d') + + # 计算前天、昨天和今天的日期 + yesterday = current_date - timedelta(days=1) + day_before_yesterday = current_date - timedelta(days=2) + + # 检查这三天内是否有相同标题的文章 + cursor.execute(''' + SELECT COUNT(*) FROM articles + WHERE title = ? AND source_date IN (?, ?, ?) + ''', (title, + day_before_yesterday.strftime('%Y-%m-%d'), + yesterday.strftime('%Y-%m-%d'), + date_str)) + + count = cursor.fetchone()[0] + logger.info(f"检查标题 '{title}' 在最近三天的重复情况: 找到 {count} 条相同记录") + + return count > 0 + + finally: + conn.close() def classify_title(title): """调用API对标题进行分类""" @@ -193,7 +210,7 @@ def process_temp_files(): # 检查重复 if check_duplicate(article['title'], source_date): - logger.info(f"跳过重复文章: {article['title']}") + logger.info(f"跳过重复文章(最近三天已存在): {article['title']}") continue # 分类标题