模型訓練的常用步驟進行個人筆記,供個人參考
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib
df = sns.load_dataset("tips")
df["log_tip"] = np.log(df["tip"])
sns.kdeplot(x="log_tip",data=df)
pal = sns.hls_palette(len(df["day"].unique()),l=.8,s=1) # l亮度,s飽和度
sns.barplot(x="sex",y="tip",data=df,hue="day",palette= pal,width=.8,legend="auto")
# upper left搭配borderaxespad=0,bbox_to_anchor=(1.02, 1),legend即不會在主window裡
plt.legend(loc='upper left', title='day',borderaxespad=0,bbox_to_anchor=(1.02, 1)) # sns竟靠plt改變legend位置
對文字進行轉換
tip_vars=['sex','smoker','day','time']
for var in tip_vars:
tip_list=var+"_"
tip_list = pd.get_dummies(df[var], prefix=var,dtype=int,drop_first=True)
data1=df.join(tip_list)
df=data1
data = df
to_keep=[i for i in data_vars if i not in tip_vars]
data_final=data[to_keep]
data_final.isnull().sum() #有缺失值時要進行處理
data= data_final
X = data.drop(["tip","log_tip"], axis=1).values
y = data.tip.values
# 資料分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
scaler = StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test= scaler.transform(X_test)
lr = LinearRegression()
lr.fit(X=X_train,y=y_train)
y_pred = lr.predict(X_test)
print(f'R2:{r2_score(y_test, y_pred):.2f}, MSE:{mean_squared_error(y_test, y_pred):.2f}')
joblib.dump(lr,'model.joblib')
joblib.dump(scaler,'scaler.joblib')
後續引用
clf = joblib.load('路徑\\model.joblib')
scaler = joblib.load('路徑\\scaler.joblib')
X_new = scaler.transform(X_new)
clf.predict(X_new)
打雜打久了,就變成打雜妹
程式寫久了,就變成老乞丐