I'm working with this dataset and applied a random forest to build a pricing model, but the model's accuracy (the R² score) is so high that I suspect something is wrong. The train and test sets are different, so I would not expect such a high score. Is there a mistake?
Output of print(score2) and print(accu2):
0.9981901132115226
[0.99086244 0.99562853 0.99551529 0.9988478 0.99997931]
# Random forest: fit a RandomForestRegressor on the price data, then report
# its hold-out score and its 5-fold cross-validation scores.
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,ExtraTreesRegressor,GradientBoostingRegressor,BaggingRegressor
# train_test_split and cross_val_score are used below, so they must be
# imported before this point for the snippet to run on its own.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

rf = RandomForestRegressor()

# Dataset without the date and id columns.
df2 = df.drop(['date', 'id'], axis=1)
# Features: everything in df2 except the target column.
# NOTE(review): an R^2 of ~0.998 on a pricing model is suspicious. Check that
# none of the remaining columns is derived from `price` (target leakage) --
# this cannot be confirmed from the code shown here.
x = df2.drop(['price'], axis=1)
# Target: only the price column.
y = df2['price']

# Split features and target in a single call so the rows are guaranteed to
# stay paired. Two separate calls only line up as long as both keep the same
# random_state, which is easy to break by accident.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Quick sanity check of what ended up in each split.
print(x_train.count())
print(x_test.count())
print(x_train.head(2))
print(x_test.head(2))

rf.fit(x_train, y_train)
# For a regressor, score() is the coefficient of determination (R^2),
# not a classification accuracy.
score2 = rf.score(x_test, y_test)
# 5-fold cross-validation on the training portion only; each fold refits a
# clone of rf, and the default scoring is again R^2.
accu2 = cross_val_score(rf, x_train, y_train, cv=5)

print("____ Random Forest Regressor____\n")
print(score2)
print(accu2)
This is how I apply the Gradient Boosting Regressor as well:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import explained_variance_score
# Gradient boosting: fit a GradientBoostingRegressor on the price data, then
# report explained variance, the hold-out score and 5-fold CV scores.

# Dataset without the date and id columns.
df2 = df.drop(['date', 'id'], axis=1)
# Features: everything in df2 except the target column.
# NOTE(review): scores this close to 1.0 are suspicious. Check that none of
# the remaining columns is derived from `price` (target leakage) -- this
# cannot be confirmed from the code shown here.
x = df2.drop(['price'], axis=1)
# Target: only the price column.
y = df2['price']

# Use the train_test_split imported from sklearn.model_selection; the old
# `cross_validation` module is never imported here and no longer exists in
# current scikit-learn. A fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

gb = GradientBoostingRegressor(n_estimators=1000)
gb.fit(x_train, y_train)
# For a regressor, score() is the coefficient of determination (R^2),
# not a classification accuracy.
score4 = gb.score(x_test, y_test)

pred = gb.predict(x_test)
# The signature is (y_true, y_pred) and the metric is not symmetric, so the
# true targets must come first.
exp_est = explained_variance_score(y_test, pred)
print("exp_est: ")
print(exp_est)

# accu4 has to be computed before it is printed below. This runs 5 extra fits
# of the 1000-estimator model on the training portion, so it is slow.
accu4 = cross_val_score(gb, x_train, y_train, cv=5)

print("____ Gradient Boosting Regressor____\n")
print(score4)
print(accu4)
0.998862149174232
[0.99741288 0.9989814 0.99979751 0.99906217 0.9999443]