Commit 07aaa16d by ccran

feat:llm数据生成

parent af61e996
{
"python-envs.defaultEnvManager": "ms-python.python:conda",
"python-envs.defaultPackageManager": "ms-python.python:conda",
"python-envs.pythonProjects": []
}
\ No newline at end of file
...@@ -11,3 +11,16 @@ ...@@ -11,3 +11,16 @@
## 4、训练 ## 4、训练
基于数据集进行SFT训练 基于数据集进行SFT训练
## 5、统一导出
合并 `merge_excel.py` 与 `merge_json.py` 的导出内容:
```bash
OPENAI_API_KEY=your_key python main.py --excel datasets/excel --json-dir datasets/json --out datasets/merged/merged_all.json
```
说明:`main.py` 会使用每条数据的 `prompt` 作为 system、`original_text` 作为 user 调用 OpenAI 接口;
解析返回 JSON 中的 `result` 与 `ground_truth` 对比,不一致会重试,默认最多 `10` 次。
可选参数:`--model`、`--max-retries`、`--openai-base-url`
依赖安装:`pip install openai tenacity pandas`
\ No newline at end of file
{
"交付时间审查": [
{
"review_item": "交付时间审查",
"original_text": "2025.09.30",
"suggest": "建议修改为:'技术协议签订后X天内完成交货'(具体天数需根据实际履约周期确定),或补充'方案确认后X天内交货'的表述,确保交货时间与前置条件挂钩。\n",
"__source_file__": "今麦郎合同审核.xlsx",
"ground_truth": "不合格",
"__source_type__": "excel",
"prompt": "你是乙方(供方、卖方)法律部门的合同审查助手 
# 交付时间审查
1、合同必须明确交货时间,且满足以下任一约定方式:
- 约定以“方案确认后XX天”作为交付时限
- 约定以“技术协议签订后XX天”作为交付时限
2、本审查只需输出一个审查结果项,如果有合格项,则优先输出合格项,如果全是不合格项,优先输出信息量最丰富最完整的不合格项
# 审查不合格建议
1、交付要提及方案确认或技术协议签订为起始时间点
# 审查约束
- 输出包括审查的句子、详情、结果、建议以及句子所在的合同分段
- 审查结果为合格/不合格,合格的审查结果无需输出建议
- 审查句子严格提取关键且详细的句子内容,保留换行、空格、标点符号等特殊字符
- 结果以JSON数组的格式返回,例如```json [{\"original_text\":\"审查句子\",\"details\":\"审查详情\",\"result\":\"不合格\",\"suggest\":\"审查建议\"}]```
依据审查要点,遵循审查约束,完成合同审查,一步步仔细思考。"
}
],
"安装调试与指导审查": [
{
"review_item": "安装调试与指导审查",
"original_text": "为向定作人提供合格的满足本工程正常运行所需的产品,承揽人还应向定作人及时提供所有与之相关的技术服务,包括但不限于:承揽人根据附件2《技术协议》和本合同规定负责产品的设计、制造、运输、安装和调试指导、试车、性能考核和验收、质保期内问题处理和技术服务。",
"suggest": "删除'指导'相关表述,修改为'调试配合'或'技术支持',例如:'负责产品的设计、制造、运输、安装和调试配合、试车、性能考核和验收'。\n\n",
"__source_file__": "今麦郎合同审核.xlsx",
"ground_truth": "不合格",
"__source_type__": "excel",
"prompt": "你是乙方(供方、卖方)法律部门的合同审查助手 
# 安装调试与指导审查
1、针对产品的安装/安装调试/安装指导/指导/技术服务,主体为乙方,动作为提供/负责/参与/指导/进行/培训,审查不合格;如果主体是甲方,或者动作为配合/协助/支持,审查合格
# 审查不合格建议
1、协商修改,修改安装调试条款为配合/协助
2、删除指导相关条款或者修改为配合、技术支持
# 审查约束
- 输出包括审查的句子、详情、结果、建议以及句子所在的合同分段
- 审查结果为合格/不合格,合格的审查结果无需输出建议
- 审查句子严格提取关键且详细的句子内容,保留换行、空格、标点符号等特殊字符
- 结果以JSON数组的格式返回,例如```json [{\"original_text\":\"审查句子\",\"details\":\"审查详情\",\"result\":\"不合格\",\"suggest\":\"审查建议\"}]```
依据审查要点,遵循审查约束,完成合同审查,一步步仔细思考。"
}
]
}
\ No newline at end of file
from __future__ import annotations
import json_repair
import argparse
import json
import os
from pathlib import Path
from typing import Any
import pandas as pd
from openai import OpenAI
from tenacity import Retrying, stop_after_attempt, wait_fixed
from merge_excel import read_excel_rows_from_path
from merge_json import apply_field_mapping, read_json_lists_from_dir
# Default OpenAI-compatible endpoint settings. base_url and model can be
# overridden via CLI flags (--openai-base-url, --model); the key is only a
# fallback for the OPENAI_API_KEY environment variable (see main()).
base_url = 'http://192.168.252.71:9002/v1'
model= 'Qwen2-72B-Instruct'
key = 'none'
# Alternative fastgpt gateway for the same model. NOTE(review): a real API
# key was committed in this comment previously — rotate that credential and
# keep secrets in the environment, never in source control.
# base_url = 'http://192.168.252.71:18088/api/v1'
# model = 'Qwen2-72B-Instruct'
# Canonicalization table: maps legacy / per-file review-item labels onto the
# unified review-item names used for grouping and prompt lookup.
REVIEW_ITEM_MAP: dict[str, str] = {
    "发票审查-csc": "发票审查",
    "发票审查-merge": "发票审查",
    "价款内容审查-merge": "价款内容审查",
    "运输保险审查-merge": "运输保险审查",
    "变更取消责任审查-csc": "变更取消责任审查",
    "安装调试审查": "安装调试与指导审查",
    "标的物内容审查": "标的物审查",
    "技术指导审查": "安装调试与指导审查",
    "履行义务审查": "第三方审查",
    "三方贷款审查": "第三方审查",
    "三方货款审查": "第三方审查",
    "违约审查": "违约与延期审查",
    "违约条款审查": "违约与延期审查",
    "延期审查": "违约与延期审查",
    "责任规定审查": "违约与延期审查",
}


def normalize_review_item(review_item: Any) -> str:
    """Return the canonical name for a raw review-item label.

    The label is stringified and stripped, then looked up in
    REVIEW_ITEM_MAP; if that misses and the label ends with ``.json``
    the suffix is removed and the lookup retried. Unmapped labels are
    returned as the (suffix-stripped) text itself.
    """
    label = str(review_item).strip()
    canonical = REVIEW_ITEM_MAP.get(label)
    if canonical is None and label.endswith(".json"):
        label = label[:-5]
        canonical = REVIEW_ITEM_MAP.get(label)
    return label if canonical is None else canonical
def merge_export_rows(
    excel_path: str | Path,
    json_dir: str | Path,
    sheet_name: str | int | None = 0,
) -> list[dict[str, Any]]:
    """Load Excel and JSON export rows, tag each row's origin, and concatenate.

    Excel rows come from ``read_excel_rows_from_path``; JSON rows are read
    from *json_dir* and passed through ``apply_field_mapping``. Every row is
    stamped with ``__source_type__`` ("excel" or "json") in place.
    """
    rows_from_excel = read_excel_rows_from_path(excel_path, sheet_name=sheet_name)
    rows_from_json = apply_field_mapping(read_json_lists_from_dir(json_dir))
    for source_rows, source_type in ((rows_from_excel, "excel"), (rows_from_json, "json")):
        for row in source_rows:
            row["__source_type__"] = source_type
    return rows_from_excel + rows_from_json
def build_review_item_mapping_from_sheet(
    excel_path: str | Path,
    sheet_name: str = "提取审查",
) -> dict[str, Any]:
    """Collect a {review point: prompt} mapping from one or more workbooks.

    *excel_path* may be a single .xlsx file or a directory of them. For each
    workbook the given sheet is read; the key column is "审查要点" (falling
    back to the first column) and the value column is "审查提示词" (falling
    back to the second). Rows with a missing key or value are skipped.
    """
    root = Path(excel_path)
    workbooks = sorted(root.glob("*.xlsx")) if root.is_dir() else [root]
    mapping: dict[str, Any] = {}
    for workbook in workbooks:
        try:
            frame = pd.read_excel(workbook, sheet_name=sheet_name)
        except Exception:
            # Best-effort: workbooks without the sheet are silently skipped.
            continue
        if frame.empty or len(frame.columns) < 2:
            continue
        key_column = "审查要点" if "审查要点" in frame.columns else frame.columns[0]
        prompt_column = "审查提示词" if "审查提示词" in frame.columns else frame.columns[1]
        for _, record in frame.iterrows():
            point = record.get(key_column)
            if pd.isna(point):
                continue
            prompt = record.get(prompt_column)
            if pd.isna(prompt):
                continue
            mapping[str(point).strip()] = prompt
    return mapping
def apply_review_item_mapping(rows: list[dict[str, Any]], mapping: dict[str, Any]) -> None:
    """Canonicalize each row's review_item in place and attach its prompt.

    Rows whose canonical review item is found in *mapping* get its value
    stored under "prompt"; unmatched names (and missing review items, shown
    as "<None>") are collected and reported once, deduplicated and sorted.
    """
    unmatched: set[str] = set()
    for row in rows:
        raw_item = row.get("review_item")
        if raw_item is None:
            unmatched.add("<None>")
            continue
        canonical = normalize_review_item(raw_item)
        row["review_item"] = canonical
        prompt = mapping.get(canonical)
        if prompt is None:
            unmatched.add(canonical)
        else:
            row["prompt"] = prompt
    if unmatched:
        print(f"[mapping_unmatched_dedup] review_items={sorted(unmatched)}")
def group_rows_by_review_item_with_dedup(
    rows: list[dict[str, Any]],
) -> dict[str, list[dict[str, Any]]]:
    """Group rows by review_item, dropping duplicate original_text per group.

    The group key is the stripped review_item string ("<None>" when absent).
    Within each group, rows whose original_text (stripped, when a string)
    was already seen are skipped; the first occurrence wins. Groups are
    returned sorted by key, each preserving first-seen row order.
    """
    groups: dict[str, list[dict[str, Any]]] = {}
    seen_texts_by_group: dict[str, set[Any]] = {}
    for row in rows:
        item = row.get("review_item")
        group_key = "<None>" if item is None else str(item).strip()
        bucket = groups.setdefault(group_key, [])
        seen_texts = seen_texts_by_group.setdefault(group_key, set())
        text = row.get("original_text")
        dedup_key = text.strip() if isinstance(text, str) else text
        if dedup_key not in seen_texts:
            seen_texts.add(dedup_key)
            bucket.append(row)
    return dict(sorted(groups.items(), key=lambda entry: entry[0]))
def print_grouped_review_item_stats(grouped_rows: dict[str, list[dict[str, Any]]]) -> None:
    """Print the group count, the total row count, and each group's size."""
    print(f"grouped_review_item_count={len(grouped_rows)}")
    print(f"grouped_total_rows={sum(map(len, grouped_rows.values()))}")
    for review_item, members in grouped_rows.items():
        print(f"[group_stats] review_item={review_item}, count={len(members)}")
def call_openai_chat_completion(
    client: OpenAI,
    model: str,
    system_prompt: str,
    user_text: str,
) -> tuple[Any, str]:
    """Run one chat completion and return ``(reasoning_content, content)``.

    Sends *system_prompt* and *user_text* as system/user messages at
    temperature 1.2. Raises ValueError when the response has no choices or
    the message content is empty/non-string.

    Fixes vs. previous version: the return annotation said ``-> str`` while
    a 2-tuple was returned, and ``message.reasoning_content`` was accessed
    directly, raising AttributeError on responses without that non-standard
    field — it is now read with getattr (None when absent).
    """
    # Some gateways (e.g. fastgpt) receive the system prompt via template
    # variables in extra_body — presumably redundant with the system message
    # for plain OpenAI endpoints; TODO confirm against the gateway config.
    extra_body = {'variables': {'system': system_prompt}} if system_prompt else None
    resp = client.chat.completions.create(
        model=model,
        temperature=1.2,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_text},
        ],
        extra_body=extra_body,
    )
    message = resp.choices[0].message if resp.choices else None
    content = message.content if message is not None else None
    # reasoning_content is a vendor extension; absent on standard responses.
    reasoning_content = getattr(message, "reasoning_content", None)
    if not isinstance(content, str) or not content.strip():
        raise ValueError("OpenAI响应缺少message.content")
    return reasoning_content, content
def filter_rows_by_llm_result(
    rows: list[dict[str, Any]],
    client: OpenAI,
    model: str,
    max_retries: int = 10,
) -> list[dict[str, Any]]:
    """Query the LLM for every row and record its response against ground truth.

    Rows lacking a non-empty "prompt"/"original_text" string or a
    "ground_truth" value are skipped entirely. For every remaining row the
    chat endpoint is called (retrying up to *max_retries* times, 1s apart)
    until the first parsed JSON item's "result" equals the row's ground
    truth; on success the row gets a 'rsp' field of
    '<think>{reasoning}</think>{parsed result}', on final failure rsp=''.
    All attempted rows are kept and returned.
    """
    kept_rows: list[dict[str, Any]] = []
    for index, row in enumerate(rows, start=1):
        system_prompt = row.get("prompt")
        user_text = row.get("original_text")
        ground_truth = row.get("ground_truth")
        # Skip rows that cannot form a complete request/label pair.
        if not isinstance(system_prompt, str) or not system_prompt.strip():
            continue
        if not isinstance(user_text, str) or not user_text.strip():
            continue
        if ground_truth is None:
            continue
        # BUG FIX: the ground truth was previously overwritten unconditionally
        # with '合格' (debug leftover), so the comparison below never used the
        # row's real label. Use the actual value.
        ground_truth_text = str(ground_truth).strip()
        try:
            for attempt in Retrying(wait=wait_fixed(1), stop=stop_after_attempt(max_retries), reraise=True):
                with attempt:
                    reasoning_content, content = call_openai_chat_completion(
                        client=client,
                        model=model,
                        system_prompt=system_prompt,
                        user_text=user_text,
                    )
                    result_text = json_repair.loads(content)
                    if result_text is None:
                        print(f"[llm_parse_failed] row={index}, attempt={attempt.retry_state.attempt_number}")
                        raise ValueError("llm response json parse failed")
                    final_result = result_text[0].get("result")
                    if final_result != ground_truth_text:
                        print(
                            f"[llm_mismatch] row={index}, attempt={attempt.retry_state.attempt_number}, "
                            f"original_text={user_text},ground_truth={ground_truth_text},result={result_text}"
                        )
                        raise ValueError("llm result mismatch")
                    print(f"[llm_match] row={index}, attempt={attempt.retry_state.attempt_number},result={result_text}")
                    row['rsp'] = f'<think>{reasoning_content}</think>{result_text}'
        except Exception as exc:
            # Retries exhausted (or parse/shape error): keep the row but mark
            # the response empty so downstream consumers can filter it.
            row['rsp'] = ''
            print(f"[llm_drop] row={index}, error={exc}")
        finally:
            kept_rows.append(row)
    print(f"llm_filtered_rows={len(kept_rows)}/{len(rows)}")
    return kept_rows
def main() -> None:
    """CLI entry: merge sources, attach prompts, run LLM filtering, save grouped JSON."""
    parser = argparse.ArgumentParser(description="合并Excel与JSON导出内容")
    parser.add_argument("--know", default="prompts/know.xlsx", help="提示词目录路径")
    parser.add_argument("--excel", default="datasets/excel", help="Excel文件路径或目录")
    parser.add_argument("--json-dir", default="datasets/json", help="JSON目录路径")
    parser.add_argument("--sheet", default=0, help="sheet名称或索引,默认0")
    parser.add_argument("--model", default=model, help="OpenAI模型名称")
    parser.add_argument("--max-retries", type=int, default=10, help="每条数据最大重试次数")
    parser.add_argument("--openai-base-url", default=base_url, help="OpenAI接口基础URL")
    parser.add_argument("--out", default="datasets/merged/merged_all.json", help="输出JSON文件路径")
    args = parser.parse_args()

    # A purely numeric --sheet string is treated as a sheet index.
    sheet: str | int | None = args.sheet
    if isinstance(sheet, str) and sheet.isdigit():
        sheet = int(sheet)

    rows = merge_export_rows(args.excel, args.json_dir, sheet_name=sheet)
    mapping = build_review_item_mapping_from_sheet(args.know, sheet_name="提取审查")
    apply_review_item_mapping(rows, mapping)

    # Environment variable wins; the module-level `key` is only a fallback.
    api_key = os.getenv("OPENAI_API_KEY", key).strip()
    if not api_key:
        raise ValueError("缺少环境变量 OPENAI_API_KEY")
    client = OpenAI(api_key=api_key, base_url=args.openai_base_url)

    rows = filter_rows_by_llm_result(
        rows=rows,
        client=client,
        model=args.model,
        max_retries=max(1, args.max_retries),
    )
    grouped_rows = group_rows_by_review_item_with_dedup(rows)

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(grouped_rows, ensure_ascii=False, indent=2), encoding="utf-8")

    print(f"len(rows)={len(rows)}")
    print(f"mapped_review_item_count={len(mapping)}")
    print_grouped_review_item_stats(grouped_rows)
    print(f"saved to: {out_path}")


if __name__ == "__main__":
    main()
...@@ -42,7 +42,7 @@ def read_json_lists_from_dir(json_dir: str | Path) -> list[dict[str, Any]]: ...@@ -42,7 +42,7 @@ def read_json_lists_from_dir(json_dir: str | Path) -> list[dict[str, Any]]:
for file_path in sorted(path.glob("*.json")): for file_path in sorted(path.glob("*.json")):
rows = read_json_list(file_path) rows = read_json_list(file_path)
for row in rows: for row in rows:
row['review_item'] = file_path.name.split('-')[0] row['review_item'] = file_path.name.split('.')[0]
row["__source_file__"] = file_path.name row["__source_file__"] = file_path.name
all_rows.extend(rows) all_rows.extend(rows)
...@@ -87,7 +87,7 @@ def main() -> None: ...@@ -87,7 +87,7 @@ def main() -> None:
print(f'len(rows)={len(rows)}') print(f'len(rows)={len(rows)}')
# print(rows[0]) # print(rows[0])
show_ds(rows) # show_ds(rows)
if __name__ == "__main__": if __name__ == "__main__":
......
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment