Hier ist mein Code:
from itertools import combinations
selected_columns = [
"Message_MessageBody_Header_",
"Message_FileSequenceNo",
"Message_MessageBody_ArticleInfo_BNo",
"EnvDate",
"EnvTime"
] # Replace with actual column names
total_count = df.count()
print(f"Total records in DataFrame: {total_count}")
missing_columns = [col for col in selected_columns if col not in df.columns]
if missing_columns:
print(f"Error: The following columns are missing in the DataFrame: {missing_columns}")
else:
print(f"Selected columns exist in the DataFrame: {selected_columns}")
found_primary_key = False
for r in range(2, len(selected_columns) + 1):
print(f"\nChecking {r}-column combinations...")
for combo in combinations(selected_columns, r):
print(f"\n
unique_count = df.select(*combo).distinct().count()
print(f"Unique count for {combo}: {unique_count}")
df.select(*combo).distinct().show(5, truncate=False)
if unique_count == total_count:
print(f"\n
found_primary_key = True
break # Stop once a valid key is found
if found_primary_key:
break
if not found_primary_key:
print("\n
Problem:
Dieser Ansatz ist rechnerisch teuer, da wiederholt `.distinct().count()`-Operationen auf großen Daten ausgeführt werden. Gibt es eine schnellere Möglichkeit?