Classification in Income per Person
Classification in Income per Person
Introduction
In the Gapminder data, income per person is analyzed in relation to oil consumption, CO2 emissions, internet user rate, and so on.
Because these features are related to social welfare, there might be some correlation with income per person.
Getting and Preparing Data
oilperperson: Oil Consumption per capita
co2emissions: CO2 emissions
internetuserate : Internet users (per 100 people)
lifeexpectancy : life expectancy at birth (years)
polityscore : subtracting an autocracy score from a democracy score.
relectricperperson: residential electricity consumption per person
urbanrate : urban population
employrate : Percentage of total population, age above 15, that has been employed
The target data is converted to 12 categories, and its data type is changed to string for tree classification.
Data Modelling
Result
The accuracy is 32.5%, which is pretty low. Either the model needs more features or it is completely wrong.
Tree Structure
Codes
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 6 14:59:43 2015
"""
import pandas
import numpy as np
import seaborn
import scipy
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics
from graphviz import Source
# Load the Gapminder extract and keep only the target (incomeperperson)
# plus the eight candidate predictors, in a single chained expression.
data = pandas.read_csv('data/gapminder.csv', low_memory=False)[[
    "incomeperperson",
    "oilperperson",
    "co2emissions",
    "internetuserate",
    "lifeexpectancy",
    "polityscore",
    "relectricperperson",
    "urbanrate",
    "employrate",
]]
def SetColumns(cols, frame=None):
    """Convert the named columns to float in place, marking bad cells as NaN.

    Cells that cannot be parsed as a number (for example the single-space
    placeholder ' ') become NaN rather than 0, so that a later ``dropna``
    can actually remove incomplete rows instead of the model silently
    training on fabricated zeros.

    Args:
        cols: Iterable of column names to convert.
        frame: DataFrame to modify. Defaults to the module-level ``data``.

    Returns:
        None. The frame is modified in place; each column name is printed
        as it is processed.
    """
    if frame is None:
        frame = data  # fall back to the script's global DataFrame
    for col in cols:
        print(col)
        # errors='coerce' turns unparseable cells into NaN; the trailing
        # astype(float) keeps the dtype float even for all-integer columns.
        frame[col] = pandas.to_numeric(frame[col], errors='coerce').astype(float)
# Column names: the target first, followed by the eight predictors.
Columns = ["incomeperperson", "oilperperson", "co2emissions", "internetuserate",
           "lifeexpectancy", "polityscore", "relectricperperson", "urbanrate", "employrate"]
SetColumns(Columns)

# Rows without a usable income cannot be labelled, so drop them.
data['incomeperperson'] = pandas.to_numeric(data['incomeperperson'], errors='coerce')
data = data.dropna(subset=['incomeperperson'])

# Bin income into intervals. pandas.cut returns NaN for values that fall
# outside every bin (with the default right-closed bins, that is anything
# <= 300 or > 200000).
data['incomeperperson'] = pandas.cut(
    data.incomeperperson,
    [300, 800, 1300, 2500, 4000, 6000, 10000, 15000, 30000, 50000, 100000, 200000],
)
# Drop unbinned rows BEFORE casting to str: once cast, NaN becomes the
# literal string 'nan', which dropna() no longer recognises and which would
# otherwise end up as a spurious target class.
data = data.dropna(subset=['incomeperperson'])
data['incomeperperson'] = data['incomeperperson'].astype('str')
data_clean = data.dropna()

# list.remove() mutates in place and returns None, so its result must not be
# assigned back to Columns. After this call Columns holds only the predictors.
Columns.remove('incomeperperson')
predictors = data_clean[Columns]
targets = data_clean.incomeperperson
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets)

# Build model on training data
classifier = DecisionTreeClassifier()
classifier = classifier.fit(pred_train, tar_train)

# Evaluate on the held-out split.
predictions = classifier.predict(pred_test)
print(sklearn.metrics.confusion_matrix(tar_test, predictions))
print(sklearn.metrics.accuracy_score(tar_test, predictions))
# Displaying the decision tree
from io import StringIO

import pydotplus
from IPython.display import Image
from sklearn import tree

# Render the fitted tree to PNG bytes via an in-memory DOT string.
out = StringIO()
tree.export_graphviz(classifier, out_file=out)
graph = pydotplus.graph_from_dot_data(out.getvalue())
Image(graph.create_png())

# Also write the DOT source to disk. The context manager guarantees the
# file is closed even if export_graphviz raises.
with open("C:/TREES/dtree2.dot", 'w') as dotfile:
    tree.export_graphviz(classifier, out_file=dotfile, feature_names=pred_test.columns)

# Render with the graphviz package and open the result in the default viewer.
graph = Source(tree.export_graphviz(classifier, out_file=None, feature_names=pred_test.columns))
graph.format = 'png'
graph.render('dtree_render', view=True)



Comments
Post a Comment