# -*- coding: utf-8 -*-
"""
Created on Tue Oct 6 14:59:43 2015
"""
import pandas
import numpy as np
import seaborn
import scipy
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from graphviz import Source
# Load the Gapminder dataset and keep only the indicator columns used below.
_keep_columns = ["incomeperperson", "oilperperson", "co2emissions", "internetuserate",
                 "lifeexpectancy", "polityscore", "relectricperperson", "urbanrate",
                 "employrate"]
data = pandas.read_csv('data/gapminder.csv', low_memory=False)
data = data[_keep_columns]
def SetColumns(cols, df=None):
    """Coerce the named columns of *df* to float, in place.

    Blank-string cells (' ') are treated as 0 before the cast; any other
    non-numeric value raises, matching the original strict behavior.

    Parameters
    ----------
    cols : iterable of str
        Column names to convert.
    df : pandas.DataFrame, optional
        Frame to modify. Defaults to the module-level ``data`` frame,
        preserving the original ``SetColumns(cols)`` call signature.
    """
    if df is None:
        df = data  # backward-compatible: the original mutated the global
    for col in cols:
        print(col)  # progress trace, as in the original script
        df[col] = df[col].replace(' ', 0)
        df[col] = df[col].astype(float, errors='raise')
# Columns to numeric-coerce; incomeperperson becomes the class target below.
Columns = ["incomeperperson", "oilperperson", "co2emissions", "internetuserate",
           "lifeexpectancy", "polityscore", "relectricperperson", "urbanrate",
           "employrate"]
SetColumns(Columns)

# Re-coerce income (unparseable cells -> NaN) and drop rows lacking a value.
data['incomeperperson'] = pandas.to_numeric(data['incomeperperson'], errors='coerce')
data = data.dropna(subset=['incomeperperson'])

# Discretize income into ordinal bands and use the band label (as a string)
# as the classification target.
data['incomeperperson'] = pandas.cut(data.incomeperperson,
                                     [300, 800, 1300, 2500, 4000, 6000, 10000,
                                      15000, 30000, 50000, 100000, 200000])
data['incomeperperson'] = data['incomeperperson'].astype('str')

data_clean = data.dropna()

# BUG FIX: list.remove() mutates in place and returns None, so the original
# `Columns = Columns.remove('incomeperperson')` silently set Columns to None.
Columns.remove('incomeperperson')
# Features (every indicator except the income class) and the class target.
predictors = data_clean[["oilperperson", "co2emissions", "internetuserate",
                         "lifeexpectancy", "polityscore", "relectricperperson",
                         "urbanrate", "employrate"]]
targets = data_clean["incomeperperson"]

# Default split: 75% train / 25% test.
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets)
# Build a random-forest model on the training data and score it on the test set.
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=25)
classifier = classifier.fit(pred_train, tar_train)
predictions = classifier.predict(pred_test)

# BUG FIX: the original computed these metrics but discarded the results —
# a no-op in a script (only a REPL would display them). Print them instead.
print(sklearn.metrics.confusion_matrix(tar_test, predictions))
print(sklearn.metrics.accuracy_score(tar_test, predictions))
# Fit an extra-trees model and report each feature's relative importance.
model = ExtraTreesClassifier()
model.fit(pred_train, tar_train)
print(model.feature_importances_)
"""
Run the random forest with an increasing number of trees and examine the
effect on the accuracy of the prediction.
"""
# BUG FIX: the original plotted trees = range(25) (i.e. 0..24) against
# accuracies computed for 1..25 estimators, shifting the x-axis by one.
trees = range(1, 26)
accuracy = np.zeros(len(trees))
for idx, n_trees in enumerate(trees):
    classifier = RandomForestClassifier(n_estimators=n_trees)
    classifier = classifier.fit(pred_train, tar_train)
    predictions = classifier.predict(pred_test)
    accuracy[idx] = sklearn.metrics.accuracy_score(tar_test, predictions)

plt.cla()
plt.plot(trees, accuracy)
# (Trailing blog-page text, kept as comments so the file parses:)
# Comments
# Post a Comment