本菜鸟的第一次机器学习实战……
1. 加载相关工具包
## 基础工具 import numpy as np import pandas as pd import warnings import matplotlib import matplotlib.pyplot as plt import seaborn as sns from scipy.special import jn from IPython.display import display, clear_output import time warnings.filterwarnings('ignore') %matplotlib inline ## 模型预测的 from sklearn import linear_model from sklearn import preprocessing from sklearn.svm import SVR from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor ## 数据降维处理的 from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA import lightgbm as lgb import xgboost as xgb ## 参数搜索和评价的 from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split from sklearn.metrics import mean_squared_error, mean_absolute_error
2. 读入数据,并查看相关信息
# Load the training and test tables; both files are parsed as
# space-separated text.
data_train = pd.read_csv('./used_car_train.csv', sep=' ')
data_test = pd.read_csv('./used_car_test.csv', sep=' ')

# Print each inspection result explicitly. A bare expression is only echoed
# when it is the last statement of a notebook cell, so the original
# describe()/shape/head()/columns lines silently produced no output.
print(data_train.describe())
print(data_train.shape)
print(data_test.shape)
print(data_train.head())
print(data_train.columns)

# DataFrame.info() writes its report to stdout itself; no print() needed.
data_train.info()
data_test.info()
3. 对标签(price)及相关特征进行分析
# Summary statistics of the target column. Printed explicitly because a bare
# expression in the middle of a cell is discarded without being shown.
print(data_train['price'].describe())

# Skewness and kurtosis of the target, as reported by pandas.
print('偏值:{}'.format(data_train['price'].skew()))
print('峰值:{}'.format(data_train['price'].kurt()))

# Histogram + density estimate of the raw price.
sns.distplot(data_train['price'])
# Relationship between the target and individual features: one scatter plot
# of price against each feature listed below, in this order.
for feature in ('power', 'kilometer'):
    # Two-column frame (price first, then the feature) used as plot input.
    data = data_train[['price', feature]]
    data.plot.scatter(x=feature, y='price')
4. 分开数值型和 object 型的数据
# Partition the training columns by dtype: `object` columns are treated as
# categorical, every other dtype as numerical.
numerical_cols = data_train.select_dtypes(exclude=['object']).columns
categorical_cols = data_train.select_dtypes(include=['object']).columns

# Show both groups, numerical first.
for column_group in (numerical_cols, categorical_cols):
    print(column_group)
画图对一些特征进行分析
# `feature_cols` was referenced without ever being defined (NameError).
# Build it from the non-object training columns, dropping the target and any
# column missing from the test table so the same selection works on both.
# NOTE(review): identifier-like numeric columns, if present, are kept here;
# prune them from this list if they should not be model inputs.
feature_cols = [
    col
    for col in data_train.select_dtypes(exclude='object').columns
    if col != 'price' and col in data_test.columns
]

X_data = data_train[feature_cols]
Y_data = data_train['price']
X_test = data_test[feature_cols]
print('X train shape:', X_data.shape)
print('X test shape:', X_test.shape)

# Price spread per body type.
data = data_train[['price', 'bodyType']]
plt.subplots(figsize=(8, 6))
sns.boxplot(x='bodyType', y='price', data=data)

# Price spread per power value.
data = data_train[['price', 'power']]
plt.subplots(figsize=(8, 6))
sns.boxplot(x='power', y='price', data=data)

# Histogram of kilometer with bin edges 0, 2.5, ..., 17.5.
data = data_train['kilometer']
bins = np.arange(0, 20, 2.5)
# Open a fresh figure first: plt.hist draws on the *current* axes, which
# would otherwise be the power box plot created just above.
plt.figure(figsize=(8, 6))
plt.hist(data, bins=bins)
再查看相关性
# `corrmat` was used without being defined (NameError). Compute the pairwise
# correlation matrix over the non-object columns; DataFrame.corr() skips
# missing values pair by pair.
corrmat = data_train.select_dtypes(exclude='object').corr()

# The ten columns with the largest correlation to price (price itself
# included, since its self-correlation is 1).
cols = corrmat.nlargest(10, 'price')['price'].index

# Reuse the matrix computed above instead of np.corrcoef on the raw values:
# np.corrcoef yields NaN for any column containing a missing value.
cm = corrmat.loc[cols, cols].values

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    cmap='Blues',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
缺失值处理
# Per-column count of missing entries, computed once and reused below.
na_per_column = data_train.isna().sum()
row_count = len(data_train)

# Absolute count and fraction of missing values, each sorted from the most
# affected column to the least; the fraction is rounded to 3 decimals.
total_missing = na_per_column.sort_values(ascending=False)
percent = (na_per_column / row_count).sort_values(ascending=False).round(3)

# Side-by-side table with one row per column of the training data.
missing_data = pd.concat(
    [total_missing, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head()
from scipy import stats
from scipy.stats import norm

# Histogram of the raw price with a fitted normal curve overlaid.
plt.figure(figsize=(8, 6))
sns.distplot(data_train['price'], fit=norm)

# Location and scale of the normal distribution fitted to the price.
(mu, sigma) = norm.fit(data_train['price'])
print('mu = {:.2f} and sigma = {:.2f}'.format(mu, sigma))

# Raw string with \mu / \sigma so mathtext renders the Greek letters; the
# original '$mu = $' / '$sigma = $' rendered the literal words instead.
plt.legend(
    [r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)],
    loc='best',
)
plt.ylabel('Frequency')
plt.title('price distribution')

# Probability plot of the price against normal quantiles, on its own figure.
fig = plt.figure(figsize=(8, 6))
# Trailing semicolon keeps a notebook from echoing probplot's return value.
stats.probplot(data_train['price'], plot=plt);
最后进行log变换
# Log-transform the target as log(1 + x). np.log1p is preferred over
# np.log(x + 1) because it stays accurate for values of x close to zero.
# NOTE: this overwrites the column in place, so running the cell twice
# applies the transform twice.
data_train['price'] = np.log1p(data_train['price'].values)

# Re-fit the normal distribution on the transformed target.
(mu, sigma) = norm.fit(data_train['price'])
print('mu = {:.2f} and sigma = {:.2f}'.format(mu, sigma))

plt.figure(figsize=(8, 6))
sns.distplot(data_train['price'], fit=norm)
# Raw string with \mu / \sigma so mathtext renders the Greek letters; the
# original '$mu = $' / '$sigma$' rendered the literal words instead.
plt.legend(
    [r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)],
    loc='best',
)
plt.ylabel('Frequency')
# The column plotted is the transformed `price`; the previous title
# ('SalePrice distribution') named a column that does not exist here.
plt.title('log1p(price) distribution')

# Probability plot of the transformed price against normal quantiles.
plt.figure(figsize=(8, 6))
# Trailing semicolon keeps a notebook from echoing probplot's return value.
stats.probplot(data_train['price'], plot=plt);
参考资料:数据挖掘之房价预测任务_sanjianjixiang的博客-CSDN博客