1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248
| from sparkai.llm.llm import ChatSparkLLM, ChunkPrintHandler from sparkai.core.messages import ChatMessage import pandas as pd import re import json import time
# WebSocket endpoint of the Spark v3.5 chat API; passed to ChatSparkLLM in call_sparkai.
SPARKAI_URL = 'wss://spark-api.xf-yun.com/v3.5/chat'
# Account credentials passed to ChatSparkLLM in call_sparkai.
# They are empty placeholders in this file.
SPARKAI_APP_ID = ''
SPARKAI_API_SECRET = ''
SPARKAI_API_KEY = ''
# Model domain passed to ChatSparkLLM as spark_llm_domain.
SPARKAI_DOMAIN = 'generalv3.5'
def call_sparkai(prompt):
    """Send *prompt* to the Spark chat model as a single user message.

    A fresh ``ChatSparkLLM`` client is built from the module-level
    ``SPARKAI_*`` settings (non-streaming) for every call.

    Returns:
        The text of the first generation, or the sentinel string
        ``'error'`` when ``generate`` raises; the exception is printed.
    """
    client = ChatSparkLLM(
        spark_api_url=SPARKAI_URL,
        spark_app_id=SPARKAI_APP_ID,
        spark_api_key=SPARKAI_API_KEY,
        spark_api_secret=SPARKAI_API_SECRET,
        spark_llm_domain=SPARKAI_DOMAIN,
        streaming=False,
    )
    conversation = [ChatMessage(role="user", content=prompt)]
    chunk_handler = ChunkPrintHandler()
    try:
        result = client.generate([conversation], callbacks=[chunk_handler])
    except Exception as exc:
        # Any failure of the request is reported via the 'error' sentinel
        # rather than propagated to the caller.
        print(exc)
        return 'error'
    else:
        return result.generations[0][0].text
# Reference ("cankao") sample: four numbered multiple-choice questions, each
# followed by an "Answer:<letter>" line. process_en passes this text to
# get_adddata_prompt_zero, which embeds it in the prompt under the
# "###参考内容" heading as the output-format example.
cankao_content = ''' 1. Which of the following is not a type of art form that Nick Smith uses in his pixelated collages? A. Painting B. Photography C. Embroidery D. Video art
Answer:C
2. What does the word "Psychology" in the title PSYCOLOURGY: January 2015 refer to in relation to Nick Smith's work? A. The study of human behavior and mental processes B. The concept of using colour to convey emotions and ideas C. The use of pixelated /image in his collages D. A specific series of artworks from 2015
Answer:B
3. Which of the following is true about Nick Smith's career as an artist? A. He has only worked in the fine arts category B. His work is primarily focused on interior design C. He has never used hand-made collages in his work D. His first collage experiment was inspired by Marilyn Monroe
Answer:D
4. Which of the following can be inferred about the text employed in Nick Smith's work? A. It is always narrative and sequential B. It is often open to interpretation by the viewer C. It is always written in a specific language or script D. It is always placed under each swatch of colour
Answer:B
'''
def get_questions(text):
    """Parse numbered four-option multiple-choice questions out of raw text.

    Each question is expected to look like ``<number>.<stem>`` followed by
    options ``A.``, ``B.``, ``C.`` and ``D.``, where every option is
    terminated by a line break (or by two whitespace characters).

    Args:
        text: Raw question block, typically one option per line.

    Returns:
        A list with one dict per matched question:
        ``{'question': <stem>, 'options': {'A': ..., 'B': ..., 'C': ..., 'D': ...}}``.
        The stem is everything after the first ``<number>.`` (not stripped on
        the left); option values keep their letter prefix and are stripped.
        An empty list is returned when nothing matches.
    """
    # Every option in the pattern below must end in two whitespace characters
    # (\s{2}); that is what lets single spaces inside an option survive.
    # Line breaks are therefore widened to TWO spaces, and two more are
    # appended so the last option terminates too. Replacing a newline with a
    # single space can never satisfy \s{2}, so newline-separated options
    # would not be recognised at all.
    flattened = text.replace('\n', '  ') + '  '

    block_pattern = re.compile(
        r'(\d+\..*?)(A\..*?\s{2})([B-D]\..*?\s{2})([B-D]\..*?\s{2})(D\..*?\s{2})',
        re.DOTALL,
    )
    # Compiled once instead of once per match.
    stem_pattern = re.compile(r'(\d+)\.(.*)')

    questions_dict_list = []
    for question, *raw_options in block_pattern.findall(flattened):
        question_text = stem_pattern.findall(question.strip())[0][1]
        # Index options by their leading letter so that B/C/D end up in the
        # right slot regardless of the order they were captured in.
        by_letter = {option[0]: option for option in raw_options}
        questions_dict_list.append({
            'question': question_text,
            'options': {
                letter: by_letter.get(letter, '').strip()
                for letter in ('A', 'B', 'C', 'D')
            },
        })
    return questions_dict_list
def remove_whitespace_and_newlines(input_string):
    """Return *input_string* with every space, newline and period removed.

    Other whitespace (tabs, for example) is left in place.
    """
    # One translate pass deletes the same three characters that were
    # previously stripped by three chained replace() calls.
    return input_string.translate(str.maketrans('', '', ' \n.'))
def get_answers(text):
    """Extract the answer letters from an answer-key string.

    Spaces, newlines and periods are removed first (via
    ``remove_whitespace_and_newlines``); every single digit followed by a
    letter ``A``-``D`` then contributes that letter.

    Returns:
        The matched letters, in order of appearance.
    """
    compact = remove_whitespace_and_newlines(text)
    return [letter for _number, letter in re.findall(r'(\d)\s*([A-D])', compact)]


def get_prompt_en(text):
    """Build the question-writing prompt for an English reading passage.

    The prompt asks for four single-answer multiple-choice questions with
    options and answers, and embeds *text* under the reading-text heading.
    """
    return f''' 你是⼀个⾼考选择题出题专家,你出的题有⼀定深度,你将根据阅读文本,出4道单项选择题,包含题目选项,以及对应的答案,注意:不⽤给出原文,每道题由1个问题和4个选项组成,仅存在1个正确答案,请严格按照要求执行。 The reading text is mainly in English. The questions and answers you raised need to be completed in English for at least the following points: ### 回答要求 (1)Understanding the main idea of the main idea. (2)Understand the specific information in the text. (3)infering the meaning of words and phrases from the context ### 阅读文本 {text} '''
def get_adddata_prompt_zero(reading, cankao_content, question, answer):
    """Build the prompt that asks the model to pair up questions and answers.

    Args:
        reading: Reading passage, placed under the "###阅读材料" heading.
        cankao_content: Format example, placed under "###参考内容".
        question: Raw question/option text, placed under "###题目".
        answer: Raw answer text, placed under "###答案".

    Returns:
        The complete prompt string.
    """
    return f'''你是一个高考英语阅读题出题专家,请阅读材料,需要参考参考内容 按照要求将题目、选项、答案对其补充完整。
###阅读材料 {reading}
###要求 1.需要将序号对应的题目与答案做匹配。 2.匹配后格式按照问题、ABCD四个选项顺序、答案的结构组合,按照参考内容格式输出。 3.如果选择题目数量不够四个请根据阅读材料及出题思路再生成题目,总题目达到四个。 4.题目中不能出现任何不合理的词汇、语法错误。 5.如果有简答题目与答案请忽略这部分内容,只处理选择题目。 6.题目编号从1开始。
###参考内容 {cankao_content}
###题目 {question}
###答案 {answer} '''
# Running tallies updated by process_en (declared ``global`` there):
# ``success`` counts rows for which the model call returned a result,
# ``fail`` counts rows for which it returned the 'error' sentinel.
success = 0
fail = 0
def call_sparkai_with_retry(prompt, retries=3, delay=5):
    """Call ``call_sparkai`` and retry when the request fails.

    ``call_sparkai`` reports a failed request by returning the sentinel
    string ``'error'`` instead of raising, so that sentinel is treated as a
    retryable failure here, alongside a ``TimeoutError`` that escapes it.

    Args:
        prompt: Prompt text forwarded to ``call_sparkai``.
        retries: Maximum number of attempts.
        delay: Seconds to sleep between attempts.

    Returns:
        The model's reply, or ``'error'`` once every attempt has failed
        (including when ``retries`` is zero or negative).
    """
    for attempt in range(retries):
        try:
            resm = call_sparkai(prompt)
            reason = "请求失败"
        except TimeoutError:
            resm = 'error'
            reason = "请求超时"

        if resm != 'error':
            return resm

        if attempt < retries - 1:
            print(f"{reason},正在重试... ({attempt + 1}/{retries})")
            time.sleep(delay)
        else:
            print(f"{reason},已达到最大重试次数。")
    # Always hand back the sentinel the caller checks for, never None.
    return 'error'
def process_en(df):
    """Build (prompt, target) training pairs from the English data frame.

    For every row the model is asked to pair up the raw questions and
    answers. When that call succeeds the reply is used as the target. When
    it returns ``'error'`` the questions and answers are parsed locally with
    ``get_questions`` / ``get_answers``; the row is kept only if at least
    one question was parsed and the counts agree.

    Args:
        df: Frame with the columns '选项' (questions and options),
            '答案' (answers) and '阅读文本' (reading passage).

    Returns:
        ``(res_input, res_output)``: two lists of equal length holding the
        prompt from ``get_prompt_en`` and the matching target text.

    Side effects:
        Increments the module-level ``success`` / ``fail`` counters.
    """
    global success
    global fail
    res_input = []
    res_output = []

    # Walk the columns positionally, so a frame whose index is not
    # 0..n-1 (after filtering, for instance) is still processed row by row.
    rows = zip(df['选项'], df['答案'], df['阅读文本'])
    for raw_options, raw_answers, reading in rows:
        prompt = get_adddata_prompt_zero(reading, cankao_content, raw_options, raw_answers)
        resm = call_sparkai_with_retry(prompt)
        data_prompt = get_prompt_en(reading)

        if resm != 'error':
            success += 1
            res_output.append(resm)
            res_input.append(data_prompt)
            continue

        # Model call failed: fall back to rule-based parsing of the row.
        fail += 1
        questions = get_questions(raw_options)
        answers = get_answers(raw_answers)
        # Skip rows that parse to nothing (an empty target is useless) or
        # whose question and answer counts disagree.
        if not questions or len(answers) != len(questions):
            continue

        res = ''.join(
            f''' {number}.{question['question']} {question['options']['A']} {question['options']['B']} {question['options']['C']} {question['options']['D']} answer:{answer} ''' + '\n'
            for number, (question, answer) in enumerate(zip(questions, answers), start=1)
        )
        res_output.append(res)
        res_input.append(data_prompt)

    return res_input, res_output
def normalize_option_markers(df):
    """Rewrite look-alike option markers in *df* to plain ASCII.

    Every string cell is scanned for a full-width full stop and for the
    Cyrillic / full-width capitals that look like ``A``-``D`` when directly
    followed by a period. Matching is literal: each pattern is passed through
    ``re.escape``, because an unescaped ``.`` under ``regex=True`` matches
    *any* character and would overwrite ordinary text.

    Returns:
        A new frame with the replacements applied.
    """
    # NOTE(review): the full-width forms (U+FF0E, U+FF24) are assumed to be
    # what the first and last replacements were meant to target; confirm
    # against the spreadsheet.
    replacements = (
        ('\uff0e', '.'),     # full-width full stop -> '.'
        ('\u0410.', 'A.'),   # Cyrillic capital A
        ('\u0412.', 'B.'),   # Cyrillic capital Ve (looks like B)
        ('\u0421.', 'C.'),   # Cyrillic capital Es (looks like C)
        ('\uff24.', 'D.'),   # full-width capital D
    )
    # Order matters: the stop is normalised first so the letter rules can
    # rely on an ASCII period.
    for look_alike, ascii_form in replacements:
        df = df.replace(re.escape(look_alike), ascii_form, regex=True)
    return df


if __name__ == '__main__':
    df = pd.read_excel('训练集-英语.xlsx')
    df = normalize_option_markers(df)
    en_input, en_output = process_en(df)
    print("success:", success)
    print("fail:", fail)
    data = [
        {'input': prompt_text, 'output': target_text}
        for prompt_text, target_text in zip(en_input, en_output)
    ]
    with open('output_en.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
|