OpenAI GPT-3 Token-Logprobs und Surprisal auf Wortebene
Posted: 20 May 2025, 17:08
Ich versuche, Surprisal-Werte auf Wortebene zu berechnen. Beispiel:
[*] Ich nehme meinen Kaffee mit Sahne und *Hund*.
Das Wort „Hund“ sollte ein hohes Surprisal ergeben (weil es unerwartet/unvorhersehbar ist); ein Wort wie „Zucker“ dagegen ein niedriges, weil es eine vorhersehbarere Vervollständigung ist. Mein Vorgehen: 1. Eine .txt-Datei mit dem von Sternchen flankierten Zielwort einlesen (z. B. *Make-up*). 2. Den Satz ohne Markierung an die API senden. 3. Die Logprobs für die Token in der Zielspanne extrahieren. Stimuli: https://osf.io/hqwer
Ich glaube nichtimport os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import argparse
from transformers import AutoTokenizer
import numpy as np
from openai import OpenAI
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def parse_args():
    """Build and parse the command-line interface.

    Exactly one of --stimuli/--stimuli_list and one of --model/--model_list
    should be supplied; --output_directory is required.

    Returns:
        argparse.Namespace with attributes stimuli, stimuli_list,
        output_directory, model, model_list.
    """
    # The original embedded a backslash line-continuation *inside* the
    # description string, which baked source indentation into the help text.
    parser = argparse.ArgumentParser(
        description='Calculates surprisal and other metrics (in development) '
                    'of transformers language models')
    parser.add_argument('--stimuli', '-i', type=str,
                        help='Stimuli to test.')
    parser.add_argument('--stimuli_list', '-ii', type=str,
                        help='Path to file containing list of stimulus files to test.')
    parser.add_argument('--output_directory', '-o', type=str, required=True,
                        help='Output directory.')
    parser.add_argument('--model', '-m', type=str,
                        help='The name of the GPT-3 model to run.')
    parser.add_argument('--model_list', '-mm', type=str,
                        help='Path to file with a list of GPT-3 models to run.')
    return parser.parse_args()
def process_args(args):
    """Resolve and validate the parsed CLI arguments.

    Prefers the list-file arguments (--model_list / --stimuli_list) and falls
    back to the single-item arguments (--model / --stimuli) when the list path
    is missing or invalid.

    Returns:
        tuple (output_directory, model_list, stimulus_file_list) where the
        last two are lists of model names / stimulus file paths.

    Raises:
        ValueError: if the output directory cannot be created, or if no model
            or no stimuli can be resolved.
    """
    # The original wrapped everything in bare `except:` blocks and used
    # `assert` for validation (stripped under -O); on failure it printed an
    # error but still fell through to an UnboundLocalError at `return`.
    output_directory = args.output_directory
    if not os.path.exists(output_directory):
        try:
            os.makedirs(output_directory)
        except OSError as e:
            raise ValueError(
                f"Cannot create output directory '{output_directory}'.") from e

    model_list = None
    if args.model_list:
        if os.path.exists(args.model_list):
            with open(args.model_list, "r") as f:
                model_list = f.read().splitlines()
        else:
            print("Error: 'model_list' argument does not have a valid path. "
                  "Trying to use individual specified model.")
    if model_list is None:
        if args.model:
            model_list = [args.model]
        else:
            raise ValueError("No model specified")

    stimulus_file_list = None
    if args.stimuli_list:
        if os.path.exists(args.stimuli_list):
            with open(args.stimuli_list, "r") as f:
                stimulus_file_list = f.read().splitlines()
        else:
            print("Error: 'stimuli_list' argument does not have a valid path. "
                  "Trying to use individual stimulus set.")
    if stimulus_file_list is None:
        if args.stimuli:
            stimulus_file_list = [args.stimuli]
        else:
            raise ValueError("No stimuli specified")

    return output_directory, model_list, stimulus_file_list
def run_models(output_directory, model_list, stimulus_file_list):
    """Score each stimulus file with each GPT-3 model and write target surprisal.

    For every (model, stimulus file) pair, writes a tab-separated output file
    with one row per stimulus line: the full sentence, the sentence up to and
    including the target, the target token string, the summed surprisal (in
    bits) over the target tokens, and the number of target tokens.

    Stimulus lines mark the target span with flanking asterisks, e.g.
    "I take my coffee with cream and *dog*."
    """
    # GPT-2's BPE tokenizer is used locally to locate the target span.
    # NOTE(review): this assumes the API echoes back the same tokenization;
    # a mismatch shifts the target indices (the "wrong token" symptom) --
    # confirm per model.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    for model_name in model_list:
        model_name_cleaned = "gpt3" + model_name.replace("-", "_")
        for stimulus_file in stimulus_file_list:
            # BUG FIX: the original called .split()/open() on the *list*
            # (stimulus_file_list) instead of on the current file path.
            stimuli_name = stimulus_file.split('/')[-1].split('.')[0]
            filename = f"{output_directory}/{stimuli_name}.surprisal.{model_name_cleaned}.causal.output"
            with open(filename, "w") as f:
                f.write("FullSentence\tSentence\tTargetWords\tSurprisal\tNumTokens\n")
            with open(stimulus_file, 'r') as f:
                stimulus_list = f.read().splitlines()
            # Renamed from 'j', which shadowed the model-loop index.
            for line_idx, stimulus in enumerate(stimulus_list):
                output = None  # so the except handler can report it safely
                try:
                    # Replace the '*' markers with a rare dummy character and
                    # detach it from a preceding space so it tokenizes on its own.
                    stimulus_spaces = stimulus.replace("*", "α").replace(" α", "α ")
                    encoded_stimulus = tokenizer.encode(stimulus_spaces)
                    # Sanity check: 'α' must tokenize as a separate token.
                    if len(tokenizer.tokenize("aα")) == 2:
                        dummy_var_idxs = np.where(
                            (np.array(encoded_stimulus) == tokenizer.encode("α")[-1]) |
                            (np.array(encoded_stimulus) == tokenizer.encode("aα")[-1])
                        )[0]
                        preceding_context = encoded_stimulus[:dummy_var_idxs[0]]
                        target_words = encoded_stimulus[dummy_var_idxs[0] + 1:dummy_var_idxs[1]]
                        stimulus_cleaned = stimulus.replace("*", "")
                        output = client.completions.create(
                            model=model_name,
                            prompt=stimulus_cleaned,
                            max_tokens=0,   # score only; generate nothing
                            temperature=0,
                            n=1,
                            stream=False,
                            logprobs=1,
                            echo=True,      # return logprobs for the prompt tokens
                            stop="\n",
                        )
                        print(f"API Response for line {line_idx + 1}: {output}")
                        logprob_data = output.choices[0].logprobs
                        # Token-level log probabilities for the echoed prompt.
                        tokens = logprob_data.tokens
                        logprob_tokens = logprob_data.token_logprobs
                        # Only skip when ALL logprobs are missing (the first
                        # echoed token always has logprob None).
                        if logprob_tokens is None or all(lp is None for lp in logprob_tokens):
                            print(f"Missing logprobs for line {line_idx + 1}: {stimulus}")
                            continue
                        start = len(preceding_context)
                        end = start + len(target_words)
                        # BUG FIX: np.nan_to_num does not convert Python None;
                        # map missing per-token logprobs to 0.0 explicitly
                        # (this caused the reported -0.0 surprisals).
                        surprisal_list = [0.0 if lp is None else lp
                                          for lp in logprob_tokens[start:end]]
                        sentence = tokenizer.decode(preceding_context + target_words)
                        target_string = "".join(tokens[start:end])
                        print(f"target words: {target_words}")
                        # Summed -log p over target tokens, converted to bits.
                        surprisal = -np.sum(surprisal_list) / np.log(2)
                        num_tokens = len(target_words)
                        with open(filename, "a") as f:
                            f.write(f"{stimulus.replace('*', '')}\t{sentence}\t"
                                    f"{target_string}\t{surprisal}\t{num_tokens}\n")
                except Exception as e:
                    print(f"Problem with stimulus on line {line_idx + 1}: {stimulus}")
                    print(f"Error: {e}")
                    print(f"OpenAI Response: {output}")
def main():
    """Entry point: parse arguments, resolve them, and run the models."""
    args = parse_args()
    try:
        output_directory, model_list, stimulus_file_list = process_args(args)
    except Exception as e:
        print(f"Error: {e}")
        return False
    try:
        run_models(output_directory, model_list, stimulus_file_list)
    except Exception as e:
        # Keep the original catch-all message but also surface the real cause
        # (the bare `except:` previously discarded it).
        print("Error: issue with stimuli, output directory, GPT-3 models chosen, or API key.")
        print(f"Error: {e}")


if __name__ == "__main__":
    main()
</code>
Das Skript wird ohne Ausnahmen ausgeführt. Es (scheinbar) extrahiert und verarbeitet viele Zielwörter richtig ...
⸻
Probleme:
1. Falsches Token extrahiert – In einigen Fällen wird das falsche Token als Ziel identifiziert. Zum Beispiel gibt das Skript in „... der *Flughafen* ...“ das Surprisal für „der“ anstelle von „Flughafen“ aus, obwohl das Ziel in der Eingabe korrekt markiert ist. 2. Fehlende Werte – Einige Zielwörter liefern ein Surprisal von -0,0 zurück, selbst wenn die Token korrekt identifiziert erscheinen. Zum Beispiel:
Jane legt ein paar Make-up < /em> auf. # Higher Surpur (!?)
Phil hat einige Make-up auf. # Niedrigere Überraise < /li>
< /ol>
Dies widerspricht dem, was wir erwarten würden – nämlich dass „Make-up“ in Satz 3 vorhersehbarer sein sollte. Viele meiner Zielwörter bestehen aus mehreren Token (z. B. „Make-up“, „Schnabel“, „Schokolade“) und werden nicht richtig behandelt. Danke!
[*] nehme ich meinen Kaffee mit Creme und Hund
Das Wort „Hund“ sollte ein hohes Überraschern ergeben (weil unerwartet/unvorhersehbar). Überraschend, weil es eine vorhersehbarere Fertigstellung ist. .TXT-Datei mit dem von Sternchen flankierten Zielwort (z. B. Make-up ). 3. < /strong> extrahieren Sie die LogProbs für die Token in der Zielspanne. Wörter: https://osf.io/hqwer
Ich glaube nichtimport os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import argparse
from transformers import AutoTokenizer
import numpy as np
from openai import OpenAI
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def parse_args():
    """Build and parse the command-line interface.

    Exactly one of --stimuli/--stimuli_list and one of --model/--model_list
    should be supplied; --output_directory is required.

    Returns:
        argparse.Namespace with attributes stimuli, stimuli_list,
        output_directory, model, model_list.
    """
    # The original embedded a backslash line-continuation *inside* the
    # description string, which baked source indentation into the help text.
    parser = argparse.ArgumentParser(
        description='Calculates surprisal and other metrics (in development) '
                    'of transformers language models')
    parser.add_argument('--stimuli', '-i', type=str,
                        help='Stimuli to test.')
    parser.add_argument('--stimuli_list', '-ii', type=str,
                        help='Path to file containing list of stimulus files to test.')
    parser.add_argument('--output_directory', '-o', type=str, required=True,
                        help='Output directory.')
    parser.add_argument('--model', '-m', type=str,
                        help='The name of the GPT-3 model to run.')
    parser.add_argument('--model_list', '-mm', type=str,
                        help='Path to file with a list of GPT-3 models to run.')
    return parser.parse_args()
def process_args(args):
    """Resolve and validate the parsed CLI arguments.

    Prefers the list-file arguments (--model_list / --stimuli_list) and falls
    back to the single-item arguments (--model / --stimuli) when the list path
    is missing or invalid.

    Returns:
        tuple (output_directory, model_list, stimulus_file_list) where the
        last two are lists of model names / stimulus file paths.

    Raises:
        ValueError: if the output directory cannot be created, or if no model
            or no stimuli can be resolved.
    """
    # The original wrapped everything in bare `except:` blocks and used
    # `assert` for validation (stripped under -O); on failure it printed an
    # error but still fell through to an UnboundLocalError at `return`.
    output_directory = args.output_directory
    if not os.path.exists(output_directory):
        try:
            os.makedirs(output_directory)
        except OSError as e:
            raise ValueError(
                f"Cannot create output directory '{output_directory}'.") from e

    model_list = None
    if args.model_list:
        if os.path.exists(args.model_list):
            with open(args.model_list, "r") as f:
                model_list = f.read().splitlines()
        else:
            print("Error: 'model_list' argument does not have a valid path. "
                  "Trying to use individual specified model.")
    if model_list is None:
        if args.model:
            model_list = [args.model]
        else:
            raise ValueError("No model specified")

    stimulus_file_list = None
    if args.stimuli_list:
        if os.path.exists(args.stimuli_list):
            with open(args.stimuli_list, "r") as f:
                stimulus_file_list = f.read().splitlines()
        else:
            print("Error: 'stimuli_list' argument does not have a valid path. "
                  "Trying to use individual stimulus set.")
    if stimulus_file_list is None:
        if args.stimuli:
            stimulus_file_list = [args.stimuli]
        else:
            raise ValueError("No stimuli specified")

    return output_directory, model_list, stimulus_file_list
def run_models(output_directory, model_list, stimulus_file_list):
    """Score each stimulus file with each GPT-3 model and write target surprisal.

    For every (model, stimulus file) pair, writes a tab-separated output file
    with one row per stimulus line: the full sentence, the sentence up to and
    including the target, the target token string, the summed surprisal (in
    bits) over the target tokens, and the number of target tokens.

    Stimulus lines mark the target span with flanking asterisks, e.g.
    "I take my coffee with cream and *dog*."
    """
    # GPT-2's BPE tokenizer is used locally to locate the target span.
    # NOTE(review): this assumes the API echoes back the same tokenization;
    # a mismatch shifts the target indices (the "wrong token" symptom) --
    # confirm per model.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    for model_name in model_list:
        model_name_cleaned = "gpt3" + model_name.replace("-", "_")
        for stimulus_file in stimulus_file_list:
            # BUG FIX: the original called .split()/open() on the *list*
            # (stimulus_file_list) instead of on the current file path.
            stimuli_name = stimulus_file.split('/')[-1].split('.')[0]
            filename = f"{output_directory}/{stimuli_name}.surprisal.{model_name_cleaned}.causal.output"
            with open(filename, "w") as f:
                f.write("FullSentence\tSentence\tTargetWords\tSurprisal\tNumTokens\n")
            with open(stimulus_file, 'r') as f:
                stimulus_list = f.read().splitlines()
            # Renamed from 'j', which shadowed the model-loop index.
            for line_idx, stimulus in enumerate(stimulus_list):
                output = None  # so the except handler can report it safely
                try:
                    # Replace the '*' markers with a rare dummy character and
                    # detach it from a preceding space so it tokenizes on its own.
                    stimulus_spaces = stimulus.replace("*", "α").replace(" α", "α ")
                    encoded_stimulus = tokenizer.encode(stimulus_spaces)
                    # Sanity check: 'α' must tokenize as a separate token.
                    if len(tokenizer.tokenize("aα")) == 2:
                        dummy_var_idxs = np.where(
                            (np.array(encoded_stimulus) == tokenizer.encode("α")[-1]) |
                            (np.array(encoded_stimulus) == tokenizer.encode("aα")[-1])
                        )[0]
                        preceding_context = encoded_stimulus[:dummy_var_idxs[0]]
                        target_words = encoded_stimulus[dummy_var_idxs[0] + 1:dummy_var_idxs[1]]
                        stimulus_cleaned = stimulus.replace("*", "")
                        output = client.completions.create(
                            model=model_name,
                            prompt=stimulus_cleaned,
                            max_tokens=0,   # score only; generate nothing
                            temperature=0,
                            n=1,
                            stream=False,
                            logprobs=1,
                            echo=True,      # return logprobs for the prompt tokens
                            stop="\n",
                        )
                        print(f"API Response for line {line_idx + 1}: {output}")
                        logprob_data = output.choices[0].logprobs
                        # Token-level log probabilities for the echoed prompt.
                        tokens = logprob_data.tokens
                        logprob_tokens = logprob_data.token_logprobs
                        # Only skip when ALL logprobs are missing (the first
                        # echoed token always has logprob None).
                        if logprob_tokens is None or all(lp is None for lp in logprob_tokens):
                            print(f"Missing logprobs for line {line_idx + 1}: {stimulus}")
                            continue
                        start = len(preceding_context)
                        end = start + len(target_words)
                        # BUG FIX: np.nan_to_num does not convert Python None;
                        # map missing per-token logprobs to 0.0 explicitly
                        # (this caused the reported -0.0 surprisals).
                        surprisal_list = [0.0 if lp is None else lp
                                          for lp in logprob_tokens[start:end]]
                        sentence = tokenizer.decode(preceding_context + target_words)
                        target_string = "".join(tokens[start:end])
                        print(f"target words: {target_words}")
                        # Summed -log p over target tokens, converted to bits.
                        surprisal = -np.sum(surprisal_list) / np.log(2)
                        num_tokens = len(target_words)
                        with open(filename, "a") as f:
                            f.write(f"{stimulus.replace('*', '')}\t{sentence}\t"
                                    f"{target_string}\t{surprisal}\t{num_tokens}\n")
                except Exception as e:
                    print(f"Problem with stimulus on line {line_idx + 1}: {stimulus}")
                    print(f"Error: {e}")
                    print(f"OpenAI Response: {output}")
def main():
    """Entry point: parse arguments, resolve them, and run the models."""
    args = parse_args()
    try:
        output_directory, model_list, stimulus_file_list = process_args(args)
    except Exception as e:
        print(f"Error: {e}")
        return False
    try:
        run_models(output_directory, model_list, stimulus_file_list)
    except Exception as e:
        # Keep the original catch-all message but also surface the real cause
        # (the bare `except:` previously discarded it).
        print("Error: issue with stimuli, output directory, GPT-3 models chosen, or API key.")
        print(f"Error: {e}")


if __name__ == "__main__":
    main()
</code>
Das Skript wird ohne Ausnahmen ausgeführt. Es (scheinbar) extrahiert und verarbeitet viele Zielwörter richtig ...
⸻
Probleme:
1. Falsches Token extrahiert – In einigen Fällen wird das falsche Token als Ziel identifiziert. Zum Beispiel gibt das Skript in „... der *Flughafen* ...“ das Surprisal für „der“ anstelle von „Flughafen“ aus, obwohl das Ziel in der Eingabe korrekt markiert ist. 2. Fehlende Werte – Einige Zielwörter liefern ein Surprisal von -0,0 zurück, selbst wenn die Token korrekt identifiziert erscheinen. Zum Beispiel:
Jane legt ein paar Make-up < /em> auf. # Higher Surpur (!?)
Phil hat einige Make-up auf. # Niedrigere Überraise < /li>
< /ol>
Dies widerspricht dem, was wir erwarten würden – nämlich dass „Make-up“ in Satz 3 vorhersehbarer sein sollte. Viele meiner Zielwörter bestehen aus mehreren Token (z. B. „Make-up“, „Schnabel“, „Schokolade“) und werden nicht richtig behandelt. Danke!