"""Extract 24-game combinations from a Word document into JSON lines.

The script reads every table of the input ``.docx`` file, parses the four
numbers and the listed solutions of each data row, writes one JSON object
per line to ``output_file`` and finally hands that file to
``db.import_json_data``.
"""

import json
import os
import re
import sys
import time
from collections import Counter
from pathlib import Path

import docx

from core.db import db

# Force UTF-8 on the standard streams so the Chinese status messages print
# regardless of the console's default encoding.  Streams that were replaced
# by objects without ``reconfigure`` are left untouched instead of crashing
# at import time.
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding='utf-8')

# Path of the generated JSON-lines file (relative to the working directory).
output_file = './24game_data.json'

# Four numbers separated by an ASCII comma, a full-width comma or whitespace.
# Whitespace-only separation ("1 2 3 4") is covered by the same pattern, so
# no separate whitespace regex is needed.
_FOUR_NUMBERS_RE = re.compile(
    r'(\d+)[,，\s]\s*(\d+)[,，\s]\s*(\d+)[,，\s]\s*(\d+)'
)

# Solutions are separated by a semicolon (ASCII or full-width), a newline, or
# whitespace that is followed by a numbering prefix such as "2." / "2、" / "2)".
# NOTE(review): the last alternative also fires inside an expression written
# with spaces, e.g. "(5 + 1)" splits before "1)" -- confirm the document never
# puts a space in front of a closing operand.
_SOLUTION_SPLIT_RE = re.compile(r'[;；]\s*|\n|\s+(?=\d+[.、)）])')

# Leading numbering such as "1." or "1)" in front of a single solution.
_NUMBERING_PREFIX_RE = re.compile(r'^\d+[.、)）]\s*')

# Any of these substrings in the answer cell marks the row as unsolvable.
# "无" alone already subsumes the longer variants; they are kept to document
# the spellings seen in the source document.
_NO_SOLUTION_MARKERS = ("无解", "无解答", "无解答案", "无", "无解★", "无解*")

# Drops the difficulty markers and normalises full-width parentheses.
_MARKER_CLEANUP = str.maketrans({
    '@': None,
    '#': None,
    '*': None,
    '（': '(',
    '）': ')',
})


def _parse_numbers(numbers_text):
    """Return the four numbers in *numbers_text* as a tuple of ints, or None."""
    match = _FOUR_NUMBERS_RE.search(numbers_text)
    if match:
        return tuple(int(group) for group in match.groups())

    # Fallback: take the first four digit runs, whatever separates them.
    digits = re.findall(r'\d+', numbers_text)
    if len(digits) >= 4:
        return tuple(int(d) for d in digits[:4])
    return None


def _parse_solutions(solutions_text):
    """Split an answer cell into ``{'c': expression, 'f': flag}`` dicts.

    The flag is 2 when the fragment contains "@@", 1 when it contains "#",
    and 0 otherwise; the marker characters are then removed from the text.
    """
    solutions = []
    for fragment in _SOLUTION_SPLIT_RE.split(solutions_text):
        sol = fragment.strip()
        if not sol:
            continue

        sol = _NUMBERING_PREFIX_RE.sub('', sol)

        if "@@" in sol:
            flag = 2
        elif '#' in sol:
            flag = 1
        else:
            flag = 0

        sol = sol.translate(_MARKER_CLEANUP).strip()
        if not sol:
            # Nothing left once numbering/markers are stripped; an empty
            # expression is not a usable solution.
            continue

        solutions.append({'c': sol, 'f': flag})
    return solutions


def extract_data_from_docx(docx_path):
    """Parse every table of the Word document into combination records.

    Args:
        docx_path: Path of the ``.docx`` file to read.

    Returns:
        A ``(data, no_solution_count)`` tuple.  ``data`` is a list of dicts
        with keys ``id`` (1-based running number), ``n1``..``n4`` (the four
        numbers) and ``s`` (list of ``{'c': expression, 'f': flag}``).
        ``no_solution_count`` is the number of rows whose answer cell
        contained a no-solution marker.  On a document-level failure the
        error is printed and ``([], 0)`` is returned.
    """
    print(f"正在读取文档: {docx_path}")
    try:
        doc = docx.Document(docx_path)

        data = []
        current_id = 0
        success_count = 0
        error_count = 0
        skipped_count = 0
        no_solution_count = 0

        for table in doc.tables:
            for row_index, row in enumerate(table.rows):
                # Fetch the cells once per row instead of on every access.
                cells = row.cells

                # The first row is the header; rows with fewer than two
                # cells cannot hold both the numbers and the answers.
                if row_index == 0 or len(cells) < 2:
                    skipped_count += 1
                    continue

                try:
                    numbers_text = cells[0].text.strip()

                    # Blank, header-like ("组合"/"答案") or too-short first
                    # cells are not data rows.
                    if (
                        len(numbers_text) < 3
                        or "组合" in numbers_text
                        or "答案" in numbers_text
                    ):
                        skipped_count += 1
                        continue

                    numbers = _parse_numbers(numbers_text)
                    if numbers is None:
                        print(f"警告: 无法解析数字: {numbers_text}")
                        error_count += 1
                        continue
                    num1, num2, num3, num4 = numbers

                    solutions_text = cells[1].text.strip()

                    current_id += 1
                    combination = {
                        'id': current_id,
                        'n1': num1,
                        'n2': num2,
                        'n3': num3,
                        'n4': num4,
                        's': [],
                    }

                    has_solution = not any(
                        marker in solutions_text
                        for marker in _NO_SOLUTION_MARKERS
                    )
                    if not has_solution:
                        no_solution_count += 1
                    elif solutions_text:
                        combination['s'] = _parse_solutions(solutions_text)

                    # Unsolvable rows are still recorded, with an empty 's'.
                    data.append(combination)
                    success_count += 1
                except Exception as e:
                    # Best effort: report the broken row and keep going.
                    print(f"处理行时出错: {e}")
                    error_count += 1
                    continue

        print(f"数据提取完成: 成功 {success_count} 条, 错误 {error_count} 条, 跳过 {skipped_count} 条, 无解 {no_solution_count} 条")
        return data, no_solution_count
    except Exception as e:
        print(f"处理文档时出错: {e}")
        return [], 0


def save_to_json(data, output_path, no_solution_count):
    """Write *data* as JSON lines to *output_path* and print statistics.

    Args:
        data: Combination records as returned by ``extract_data_from_docx``.
        output_path: Destination file; missing parent directories are created.
        no_solution_count: Number of unsolvable combinations, used only for
            the printed summary.

    Returns:
        True on success, False if anything went wrong (the error is printed).
    """
    print(f"正在保存数据到: {output_path}")
    try:
        # A bare filename has an empty dirname, which os.makedirs rejects.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            for item in data:
                json.dump(item, f, ensure_ascii=False, separators=(',', ':'))
                f.write('\n')

        # One pass over all solutions yields the total and per-flag counts.
        flag_counts = Counter(sol['f'] for item in data for sol in item['s'])
        total_solutions = sum(flag_counts.values())

        print("\n统计信息汇总:")
        print(f"总组合数: {len(data)} 个")
        print(f"有解组合: {len(data) - no_solution_count} 个")
        print(f"无解组合: {no_solution_count} 个")
        print(f"总解答数: {total_solutions} 个")
        print(f"难度统计: 普通解法 {flag_counts[0]} 个, 一星解法 {flag_counts[1]} 个, 二星解法 {flag_counts[2]} 个")
        return True
    except Exception as e:
        print(f"保存数据时出错: {e}")
        return False


def migrate_data(json_path=None):
    """Import the JSON data file into the database via ``db.import_json_data``.

    Args:
        json_path: File to import.  Defaults to ``24game_data.json`` located
            next to this script.

    Failures are printed, not raised.
    """
    if json_path is None:
        json_path = Path(__file__).parent / '24game_data.json'

    try:
        db.import_json_data(json_path)
        print("数据迁移完成")
    except Exception as e:
        print(f"数据迁移失败: {e}")


def main():
    """Run the pipeline: extract from the docx, save as JSON, import to the DB."""
    docx_path = './24点游戏所有组合答案修正版.docx'

    if not os.path.exists(docx_path):
        print(f"错误: 文件不存在 {docx_path}")
        return

    start_time = time.time()
    print("正在提取数据,请稍候...")

    data, no_solution_count = extract_data_from_docx(docx_path)
    if not data:
        print("错误: 未能从文档中提取数据")
        return

    # Do not import a missing or stale file when saving failed; save_to_json
    # has already printed the reason.
    if not save_to_json(data, output_file, no_solution_count):
        return

    # Import exactly the file that was just written, even when the working
    # directory differs from the script's directory.
    migrate_data(Path(output_file).resolve())

    end_time = time.time()
    print(f"\n处理完成,耗时 {end_time - start_time:.2f} 秒")
    print(f"数据已保存到: {output_file}")


if __name__ == "__main__":
    main()