Cleaning and Labelling Data with LLMs
by Jacob Dichter
June 3, 2025
Below is a simple Python script that batches your descriptions and sends each batch to an OpenAI chat model (`gpt-4o-mini`) via the OpenAI API.
import openai
import time
# Placeholder value: replace it with your own API key before running.
# NOTE(review): consider loading the key from an environment variable
# instead of hard-coding it, so a real key is never committed or published.
openai.api_key = "YOUR_API_KEY_HERE"
def batch_list(lst, n):
    """Yield successive n-sized chunks from lst.

    Args:
        lst: A sequence supporting ``len()`` and slicing (list, tuple, str).
        n: Maximum number of items per chunk; must be at least 1.

    Yields:
        Consecutive slices of ``lst`` holding ``n`` items each; the final
        slice is shorter when ``len(lst)`` is not a multiple of ``n``.

    Raises:
        ValueError: If ``n`` is less than 1. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    if n < 1:
        # A negative step makes range() empty, which would silently drop
        # every item; zero fails inside range() with an unrelated message.
        raise ValueError(f"batch size must be a positive integer, got {n!r}")
    for start in range(0, len(lst), n):
        yield lst[start:start + n]
def create_prompt(batch):
    """Build the rewrite-request prompt for one batch of descriptions.

    Args:
        batch: An iterable of description strings.

    Returns:
        A single string: the instruction header, the descriptions as a
        1-based numbered list (one per line), and a closing instruction
        asking for a numbered list in return.
    """
    header = (
        "Please rewrite each of the following commodity descriptions to be "
        "concise, natural, and easy to understand. Keep the meaning but "
        "simplify wording.\n\n"
    )
    # Join once instead of growing the string with += on every iteration.
    numbered = "".join(f"{i}. {desc}\n" for i, desc in enumerate(batch, 1))
    footer = (
        "\nReturn your response as a numbered list with each simplified "
        "description."
    )
    return header + numbered + footer
def call_openai_api(prompt):
    """Send one prompt to the chat model and return the text of its reply.

    Args:
        prompt: The full user message to send.

    Returns:
        The ``content`` of the first choice in the chat completion response.

    Any exception raised by the ``openai`` library (network, authentication,
    rate limit) propagates to the caller unchanged.
    """
    # openai>=1.0 removed ``openai.ChatCompletion``; calling it there raises
    # an error. Use the current module-level interface when it exists and
    # fall back to the legacy call on pre-1.0 installations.
    if hasattr(openai, "chat"):
        create_completion = openai.chat.completions.create
    else:
        create_completion = openai.ChatCompletion.create

    response = create_completion(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt},
        ],
        temperature=0.5,
        # NOTE(review): 500 tokens caps the reply for a whole batch; long
        # descriptions may be truncated - confirm this is enough.
        max_tokens=500,
    )
    return response.choices[0].message.content
def main():
    """Simplify every description in batches and save the raw responses.

    Iterates over ``hs4_descriptions`` in batches of 10, sends each batch to
    the API, prints each response, and finally writes all responses to
    ``simplified_descriptions.txt`` separated by blank lines.

    NOTE(review): ``hs4_descriptions`` is not defined in this snippet; it
    must exist at module level (presumably a list of description strings)
    before ``main`` is called.
    """
    batch_size = 10
    simplified_descriptions = []

    for batch in batch_list(hs4_descriptions, batch_size):
        prompt = create_prompt(batch)
        print("Sending batch to API...")
        simplified = call_openai_api(prompt)
        print("Received response:")
        print(simplified)
        simplified_descriptions.append(simplified)
        time.sleep(1)  # Be polite and avoid hitting rate limits

    # Optional: save results to a file
    # Explicit UTF-8: model output may contain non-ASCII characters, and the
    # platform default encoding (e.g. cp1252 on Windows) can fail on them.
    with open("simplified_descriptions.txt", "w", encoding="utf-8") as f:
        for batch_text in simplified_descriptions:
            f.write(batch_text + "\n\n")
# Run the batch job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
What this does:
- Splits your 1229 descriptions into groups of 10.
- Sends each batch to ChatGPT with a clear prompt.
- Prints and saves the simplified responses.
tags: