Mein Ausbilder möchte, dass ich ein Regressionsmodell erstelle.
Code (y mit zufälligem Rauschen, 97,8 % R2-Score für jeden der 2 Cluster, wurde aber abgelehnt):
Code: Select all
import pandas as pd
import numpy as np
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including pandas warnings that point at real problems. Kept as in the
# original; consider narrowing it to a specific category.
warnings.filterwarnings("ignore")

# loading data
rest_data_path = '서울시 휴게음식점 인허가 정보.csv'
one_person_data_path = '1인가구(연령별).csv'
rest_data = pd.read_csv(rest_data_path, encoding='utf-8')
one_person_data = pd.read_csv(one_person_data_path, encoding='utf-8')

# preprocessing data
one_person_data = one_person_data.rename(columns={'자치구별(2)': '자치구'})
# Drop the rows whose district cell merely repeats the header text.
# .copy() makes the filtered frame independent, so the column assignment
# below writes to this frame and not to a temporary view of the original.
one_person_data_cleaned = one_person_data[one_person_data['자치구'] != '자치구별(2)'].copy()
# Row-wise sum of the columns '2023' .. '2023.15' (label slice, inclusive);
# cells that cannot be parsed as numbers become NaN and are skipped by sum().
one_person_data_cleaned['2023_합계'] = (
    one_person_data_cleaned.loc[:, '2023':'2023.15']
    .apply(pd.to_numeric, errors='coerce')
    .sum(axis=1)
)
one_person_summary = one_person_data_cleaned[['자치구', '2023_합계']]

rest_data_cleaned = rest_data.rename(columns={'지번주소': '주소'})
# Second space-separated token of the address (presumably the district
# name -- verify against the address format of the restaurant file).
rest_data_cleaned['자치구'] = rest_data_cleaned['주소'].str.split(' ').str[1]
# NOTE(review): if one_person_summary holds more than one row per 자치구,
# this left merge repeats each restaurant row once per match -- confirm
# that the summary is unique per district.
merged_data = pd.merge(rest_data_cleaned, one_person_summary, on='자치구', how='left')

# add noise: Gaussian, sigma = 5 % of the standard deviation of 2023_합계
np.random.seed(42)
noise = np.random.normal(0, 0.05 * merged_data['2023_합계'].std(), size=merged_data.shape[0])
merged_data['y'] = merged_data['2023_합계'] + noise

# save data (rows with a missing value in any selected column are dropped)
result_continuous = merged_data[['자치구', '주소', '2023_합계', 'y']].dropna()

# output
result_continuous
Code: Select all
자치구 주소 2023_합계 y
\5 Gwanak-gu, Seoul 1562-17 Bongcheon-dong, Gwanak-gu, Seoul 159036,0 158422,996610
\6 Gwanak-gu, Seoul 1562-17 Bongcheon-dong, Gwanak-gu, Sonderstadt 142454,0 146588,600626 Gwanak-gu, Seoul 142454.0 138756.390843
\46 Gwanak-gu 1519-22 Sillim-dong, Gwanak-gu, Seoul 159036.0 157829.983096
\... ... ... ... . ..
\142131 Donga Cheongsol Apartment, 808 Chang-dong, Dobong-gu, Seoul, Dobong-gu Raum 209, Einkaufsgebäude 46250,0 43593,821116
\142138 Gwanak-gu, Seoul 1458-4 Sillim-dong, Gwanak-gu, Seoul Centerville 13th 159036,0 157758,991278
\142139 Gwanak- gu, Seoul Gwanak-gu Sillim-dong 1458-4 Centerville 13th 142454,0 140637,617424 ak- gu Woohyung Building, 928-1 Bongcheon-dong, Gwanak-gu, Seoul 142454,0 142926,767535
\14162 Zeilen × 4 Spalten\
Code: Select all
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

columns_to_encode = ['자치구']
columns_to_exclude = ['2023_합계']

# One-hot encode the district column into a dense array.
# NOTE(review): drop='first' is NOT passed here, so the one-hot columns are
# perfectly collinear (they sum to 1 per row). Pass drop='first' if the
# downstream model is sensitive to multicollinearity.
encoder = OneHotEncoder(sparse_output=False)
encoded_columns = encoder.fit_transform(result_continuous[columns_to_encode])
encoded_column_names = encoder.get_feature_names_out(columns_to_encode)
encoded_df = pd.DataFrame(encoded_columns, columns=encoded_column_names, index=result_continuous.index)

# Remove the raw district column and the un-noised total before joining the
# encoded columns back on the shared index.
processed_data = result_continuous.drop(columns=columns_to_encode + columns_to_exclude)
final_data = pd.concat([processed_data, encoded_df], axis=1)

# The builtin hash() of a str is salted per interpreter process, so it gives
# different feature values on every run. hash_pandas_object is deterministic;
# index=False hashes the address text only.
final_data['주소_hash'] = pd.util.hash_pandas_object(result_continuous['주소'], index=False)
final_data = final_data.drop(columns=['주소'])  # remove the original address column

# Target: log(1 + y), kept as a one-column DataFrame.
y = np.log1p(final_data[['y']])
final_data = final_data.drop(columns=['y'])

# Standardize every remaining feature column (zero mean, unit variance).
scaler = StandardScaler()
final_data_scaled = scaler.fit_transform(final_data)
final_data_scaled, y
(array([[ 0.72281968, -0.72281968, 1.14388039],
[ 0.72281968, -0.72281968, 1.14388039],
[ 0.72281968, -0.72281968, -0.74799056],
...,
[ 0.72281968, -0.72281968, 0.60622321],
[ 0.72281968, -0.72281968, 0.14385426],
[ 0.72281968, -0.72281968, 0.14385426]]),
y
5 11.973030
6 11.895392
18 11.961831
19 11.840482
46 11.969280
... ...
142131 10.682694
142138 11.968830
142139 11.853949
142142 11.975973
142143 11.870095
[14162 Zeilen x 1 Spalte]