extract_24game_data.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. import docx, json, re, os, sys, time
  2. from pathlib import Path
  3. from core.db import db
  4. # 设置编码
  5. sys.stdout.reconfigure(encoding='utf-8')
  6. sys.stderr.reconfigure(encoding='utf-8')
  7. # 定义输出文件路径
  8. output_file = './24game_data.json'
  9. def extract_data_from_docx(docx_path):
  10. print(f"正在读取文档: {docx_path}")
  11. try:
  12. # 打开Word文档
  13. doc = docx.Document(docx_path)
  14. # 初始化数据结构
  15. data = []
  16. current_id = 0
  17. success_count = 0
  18. error_count = 0
  19. skipped_count = 0
  20. no_solution_count = 0
  21. # 遍历文档中的表格
  22. for table_index, table in enumerate(doc.tables):
  23. # 跳过表头行
  24. for row_index, row in enumerate(table.rows):
  25. if row_index == 0 or len(row.cells) < 2: # 跳过表头和格式不正确的行
  26. skipped_count += 1
  27. continue
  28. # 跳过表头行
  29. if row.cells[0].text.strip() == "" or "答案" in row.cells[0].text:
  30. skipped_count += 1
  31. continue
  32. # 获取单元格内容
  33. try:
  34. # 第一个单元格包含四个数字
  35. numbers_text = row.cells[0].text.strip()
  36. # 跳过表头或非数据行
  37. if "组合" in numbers_text or "答案" in numbers_text or len(numbers_text) < 3:
  38. skipped_count += 1
  39. continue
  40. # 尝试匹配格式:1, 2, 3, 4 或 1,2,3,4(中文逗号)
  41. numbers_match = re.search(r'(\d+)[,,\s]\s*(\d+)[,,\s]\s*(\d+)[,,\s]\s*(\d+)', numbers_text)
  42. # 尝试匹配格式:1 2 3 4(空格分隔)
  43. if not numbers_match:
  44. numbers_match = re.search(r'(\d+)\s+(\d+)\s+(\d+)\s+(\d+)', numbers_text)
  45. # 尝试匹配任意格式的四个数字
  46. if not numbers_match:
  47. numbers = re.findall(r'\d+', numbers_text)
  48. if len(numbers) >= 4:
  49. num1, num2, num3, num4 = map(int, numbers[:4])
  50. else:
  51. print(f"警告: 无法解析数字: {numbers_text}")
  52. error_count += 1
  53. continue
  54. else:
  55. num1 = int(numbers_match.group(1))
  56. num2 = int(numbers_match.group(2))
  57. num3 = int(numbers_match.group(3))
  58. num4 = int(numbers_match.group(4))
  59. # 第二个单元格包含解答
  60. solutions_text = row.cells[1].text.strip()
  61. # 创建新的组合
  62. current_id += 1
  63. combination = {
  64. 'id': current_id,
  65. 'n1': num1,
  66. 'n2': num2,
  67. 'n3': num3,
  68. 'n4': num4,
  69. 's': []
  70. }
  71. # 检查是否有解答
  72. no_solution_patterns = ["无解", "无解答", "无解答案", "无", "无解★", "无解*", "无解*"]
  73. has_solution = True
  74. for pattern in no_solution_patterns:
  75. if pattern in solutions_text:
  76. has_solution = False
  77. no_solution_count += 1
  78. break
  79. # 如果有解答,解析它们
  80. if solutions_text and has_solution:
  81. # 分割多个解答(根据图片中的格式,解答可能用分号、分号+空格或其他分隔符分隔)
  82. solutions = re.split(r'[;;]\s*|\n|\s+(?=\d+[.、))])', solutions_text)
  83. for i, sol in enumerate(solutions, 1):
  84. sol = sol.strip()
  85. if not sol:
  86. continue
  87. # 移除可能的序号前缀,如 "1." 或 "(1)"
  88. sol = re.sub(r'^\d+[.、))]\s*', '', sol)
  89. # 检查星号数量来确定flag值
  90. flag = 0
  91. # 处理不同类型的星号字符
  92. if "@@" in sol :
  93. flag = 2
  94. elif '#' in sol :
  95. flag = 1
  96. # 移除所有类型的星号
  97. sol = sol.replace("@", "").replace("#", "").replace("*", "").replace("(","(").replace(")",")").strip()
  98. # 添加解答
  99. solution = {
  100. # 'q': current_id, # 使用当前数据的ID
  101. 'c': sol,
  102. 'f': flag
  103. }
  104. combination['s'].append(solution)
  105. # 添加到数据列表
  106. data.append(combination)
  107. success_count += 1
  108. except Exception as e:
  109. print(f"处理行时出错: {e}")
  110. error_count += 1
  111. continue
  112. print(f"数据提取完成: 成功 {success_count} 条, 错误 {error_count} 条, 跳过 {skipped_count} 条, 无解 {no_solution_count} 条")
  113. return data, no_solution_count
  114. except Exception as e:
  115. print(f"处理文档时出错: {e}")
  116. return [], 0
  117. def save_to_json(data, output_path, no_solution_count):
  118. print(f"正在保存数据到: {output_path}")
  119. try:
  120. # 创建输出目录
  121. os.makedirs(os.path.dirname(output_path), exist_ok=True)
  122. with open(output_path, 'w', encoding='utf-8') as f:
  123. for i, item in enumerate(data):
  124. json.dump(item, f, ensure_ascii=False, separators=(',', ':'))
  125. f.write('\n')
  126. # 统计解答数量
  127. total_solutions = sum(len(item['s']) for item in data)
  128. # 统计不同难度的解答
  129. flag0_count = sum(1 for item in data for sol in item['s'] if sol['f'] == 0)
  130. flag1_count = sum(1 for item in data for sol in item['s'] if sol['f'] == 1)
  131. flag2_count = sum(1 for item in data for sol in item['s'] if sol['f'] == 2)
  132. # 输出完整统计信息
  133. print("\n统计信息汇总:")
  134. print(f"总组合数: {len(data)} 个")
  135. print(f"有解组合: {len(data) - no_solution_count} 个")
  136. print(f"无解组合: {no_solution_count} 个")
  137. print(f"总解答数: {total_solutions} 个")
  138. print(f"难度统计: 普通解法 {flag0_count} 个, 一星解法 {flag1_count} 个, 二星解法 {flag2_count} 个")
  139. return True
  140. except Exception as e:
  141. print(f"保存数据时出错: {e}")
  142. return False
  143. def migrate_data():
  144. """将JSON数据迁移到SQLite数据库"""
  145. # 获取JSON数据文件路径
  146. json_path = Path(__file__).parent / '24game_data.json'
  147. # 导入数据
  148. try:
  149. db.import_json_data(json_path)
  150. print("数据迁移完成")
  151. except Exception as e:
  152. print(f"数据迁移失败: {e}")
  153. def main():
  154. docx_path = './24点游戏所有组合答案修正版.docx'
  155. # 检查文件是否存在
  156. if not os.path.exists(docx_path):
  157. print(f"错误: 文件不存在 {docx_path}")
  158. return
  159. start_time = time.time()
  160. print("正在提取数据,请稍候...")
  161. # 提取数据
  162. data, no_solution_count = extract_data_from_docx(docx_path)
  163. # 检查是否成功提取数据
  164. if not data:
  165. print("错误: 未能从文档中提取数据")
  166. return
  167. # 保存为JSON
  168. save_to_json(data, output_file, no_solution_count)
  169. migrate_data()
  170. end_time = time.time()
  171. print(f"\n处理完成,耗时 {end_time - start_time:.2f} 秒")
  172. print(f"数据已保存到: {output_file}")
  173. if __name__ == "__main__":
  174. main()