栏目分类:
子分类:
返回
文库吧用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
文库吧 > IT > 软件开发 > 后端开发 > Python

二手车交易价格预测(一)

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

二手车交易价格预测(一)

本菜鸟的第一次机器学习实战……

1,加载相关工具包

## Core tooling: numerical/tabular libraries, plotting, and notebook helpers
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time

# Silence every warning for the rest of the session. Note that this also
# hides deprecation notices emitted by the libraries imported here.
warnings.filterwarnings('ignore')
%matplotlib inline

## Regression models
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

## Dimensionality reduction
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA

## Gradient-boosting libraries
import lightgbm as lgb
import xgboost as xgb

## Hyper-parameter search and evaluation
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

 2,读入数据,并查看相关信息

# Read the training and test tables; both files are parsed with a single
# space as the field delimiter.
data_train, data_test = (
    pd.read_csv(csv_path, sep=' ')
    for csv_path in ('./used_car_train.csv', './used_car_test.csv')
)

# Quick inspection of both tables. In a notebook, a bare expression is only
# rendered when it is the last statement of its cell; `.info()` prints
# directly regardless of position.
data_train.describe()
data_train.shape
data_test.shape
data_train.head()
data_train.info()
data_train.columns
data_test.info()

3,对特征标签进行分析

# Summarise the target column: descriptive statistics, skewness and
# kurtosis, followed by a distribution plot.
price_series = data_train['price']
price_series.describe()

price_skew = price_series.skew()
price_kurt = price_series.kurt()
print('偏值:{}'.format(price_skew))   # label printed for the skewness value
print('峰值:{}'.format(price_kurt))   # label printed for the kurtosis value

sns.distplot(price_series)

 

 

# Relationship between the target and individual features: one scatter plot
# of price against each feature listed below.
for feature_name in ('power', 'kilometer'):
    # `data` is rebound on every pass, so after the loop it holds the
    # price/kilometer pair.
    data = pd.concat([data_train['price'], data_train[feature_name]], axis=1)
    data.plot.scatter(x=feature_name, y='price')

 4,分开数值型和object型的数据

 

# Partition the training columns by dtype: everything that is not `object`
# is treated as numerical, and `object` columns as categorical.
numerical_cols = data_train.select_dtypes(exclude=['object']).columns
categorical_cols = data_train.select_dtypes(include=['object']).columns

print(numerical_cols)
print(categorical_cols)

 

 画图对一些特征进行分析

# Build the feature matrices and the target vector.
#
# `feature_cols` was used here without ever being defined, which raised a
# NameError. It is now derived from the numeric training columns, dropping
# the target and keeping only columns that also exist in the test set, so
# that the same selection is valid for both frames.
# NOTE(review): identifier-like numeric columns are not filtered out here;
# confirm whether any should be excluded before modelling.
feature_cols = [
    col for col in numerical_cols
    if col != 'price' and col in data_test.columns
]

X_data = data_train[feature_cols]
Y_data = data_train['price']

X_test = data_test[feature_cols]

print('X train shape:',X_data.shape)
print('X test shape:',X_test.shape)

# Box plots of price grouped by a single feature, one 8x6 figure per feature.
for box_feature in ('bodyType', 'power'):
    data = data_train[['price', box_feature]]
    plt.subplots(figsize=(8, 6))
    sns.boxplot(x=box_feature, y='price', data=data)

# Histogram of the kilometer column with bin edges 0, 2.5, ..., 17.5.
data = data_train['kilometer']
bins = np.arange(0, 20, 2.5)
plt.hist(data, bins=bins)

 再查看相关性

# Correlation heat map of the ten columns most correlated with price.
#
# `corrmat` was referenced without being defined, which raised a NameError.
# It is computed here from the non-object columns only, because
# correlations are undefined for string-valued columns.
corrmat = data_train.select_dtypes(exclude='object').corr()

# Labels of the ten columns with the largest correlation to price
# (price itself is included, with correlation 1).
cols = corrmat.nlargest(10, 'price')['price'].index
# NOTE(review): np.corrcoef propagates NaN, so any selected column with
# missing values yields NaN cells in the heat map — confirm this is acceptable.
cm = np.corrcoef(data_train[cols].values.T)
plt.figure(figsize = (8, 6))
sns.heatmap(cm, cbar = True, annot = True, square = True, fmt = '.2f', cmap = 'Blues',
            annot_kws = {'size': 10}, yticklabels = cols.values, xticklabels = cols.values)

 

 缺失值处理

# Missing-value overview: per-column null count and the fraction of rows
# affected, both ordered from most to least missing.
null_counts = data_train.isnull().sum()
row_count = len(data_train)

total_missing = null_counts.sort_values(ascending = False)
percent = (null_counts / row_count).sort_values(ascending = False).round(3)

# Side-by-side table; only the top rows are shown.
missing_data = pd.concat([total_missing, percent], axis = 1, keys = ['Total', 'Percent'])
missing_data.head()

 

from scipy.stats import norm
from scipy import stats

# Compare the raw price distribution with a fitted normal curve.
plt.figure(figsize = (8, 6))
sns.distplot(data_train['price'], fit = norm)

# Maximum-likelihood estimates of the normal parameters for price.
(mu, sigma) = norm.fit(data_train['price'])
print('mu = {:.2f} and sigma = {:.2f}'.format(mu, sigma))

# The legend previously used `$mu$` / `$sigma$`, which mathtext renders as
# the literal letters; `\mu` / `\sigma` give the Greek symbols. A raw string
# keeps the backslashes from being read as Python escape sequences.
plt.legend([r'Normal dist. ($\mu = $ {:.2f} and $\sigma = $ {:.2f})'.format(mu, sigma)], loc = 'best')
plt.ylabel('Frequency')
plt.title('price distribution')

# Q-Q plot of price against the normal distribution; the trailing semicolon
# suppresses the returned value in notebook output.
fig = plt.figure(figsize = (8, 6))
stats.probplot(data_train['price'], plot = plt);

最后进行log变换

# Log-transform the target with log(1 + x).
# np.log1p is used instead of np.log(x + 1) because it stays accurate for
# very small x.
# NOTE: this overwrites data_train['price'] in place, so running this
# section a second time applies the transform twice.
data_train['price'] = np.log1p(data_train['price'].values)

# Refit the normal parameters on the transformed target.
(mu, sigma) = norm.fit(data_train['price'])
print('mu = {:.2f} and sigma = {:.2f}'.format(mu, sigma))

plt.figure(figsize = (8, 6))
sns.distplot(data_train['price'], fit = norm)
# `\mu` / `\sigma` in a raw string render as Greek symbols; the previous
# `$mu$` / `$sigma$` rendered as the literal letters.
plt.legend([r'Normal dist. ($\mu = $ {:.2f} and $\sigma = $ {:.2f})'.format(mu, sigma)], loc = 'best')
plt.ylabel('Frequency')
# The title previously read 'SalePrice distribution', a column name that
# does not exist in this dataset; it now names what is actually plotted.
plt.title('log1p(price) distribution')

# Q-Q plot of the transformed target against the normal distribution.
plt.figure(figsize = (8, 6))
stats.probplot(data_train['price'], plot = plt);

 

 参考资料:(11条消息) 数据挖掘之房价预测任务_sanjianjixiang的博客-CSDN博客

转载请注明:文章转载自 www.wk8.com.cn
本文地址:https://www.wk8.com.cn/it/280164.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 wk8.com.cn

ICP备案号:晋ICP备2021003244-6号