Sunday, 20 July 2014

Random Forest implementation using Python Sklearn library

Python is wonderful programming language. The more I explore, the more I am amazed about the power of this language.

Below is a recent Random forest implementation using Pandas and Sklearn libraries.

# -*- coding: utf-8 -*-
"""
Spyder Editor

This is Random forest implementation.
"""


import numpy as np
import csv as csv
import scipy as sp
import pandas as pandas
import matplotlib as mpl
import re


print "import successful"

csv_object = csv.reader(open("C:\\train.csv"), delimiter = ",")

# skipping the header
header = csv_object.next()

data = []
count = 0
for row in csv_object:
count +=1
data.append(row)
data = np.array(data)

#print data[0:15,5]
#print type(data[0::,5])

# Importing the pandas module to replace missing values

df = pd.read_csv("C:\\train.csv", header = 0)


# Extracting Salutation from Names

Salutation = ['Rev.', 'Mrs.', 'Miss.', 'Master.', 'Mr.']
#Salutation = ['Rev.']

df['Salutation'] = 100


pattern = ''

for i in xrange(0,len(Salutation)):
pattern = Salutation[i]
print "pattern is",pattern
# print df[df['Name'].str.contains(pattern)]['Salutation'].replace(False, i)
#df['Salutation']= df['Name'].str.contains(pattern).replace('False', i)
df.loc[(df['Name'].str.contains(pattern)), 'Salutation'] = i
#print pattern, df['Salutation']
print df.head(2)

print df[(df.Salutation != 100)][['Name', 'Salutation']]


df['Gender'] = df.Sex.map({'female':0 , 'male' : 1}). astype(int)
#print df[(df.Gender == 1) & (df.Pclass == 1)]

# Replace the ages in the dataset

median_ages = np.zeros((2,3))

for i in range(0,2):
for j in range(0,3):
median_ages[i,j] = df[(df.Gender == i) & (df.Pclass == j+1)]['Age'].dropna().median()

#print median_ages

df['AgeFill'] = df['Age']
#print df[df.Age.isnull()][['Age', 'AgeFill', 'Pclass', 'Sex', 'Survived']]

for i in range(0,2):
for j in range(0,3):
df.loc[(df.Age.isnull()) & (df.Gender == i) & (df.Pclass == j+1) , 'AgeFill'] = median_ages[i,j]

df['EmbarkedCode'] = df.Embarked.map({'C':0, 'S' : 1, 'Q':2})

#print df[df.Fare.isnull()][['Age', 'AgeFill', 'Pclass', 'Sex', 'Survived']]
#print df[df.Age.isnull()][['Age', 'AgeFill', 'Pclass', 'Sex', 'Survived', 'EmbarkedCode']]

#Convert the integer dataframe into numpy arraay

df = df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age'], axis = 1)
#for i in range(0,3):
# print i, len(df[(df.EmbarkedCode == i)])

df.loc[df.EmbarkedCode.isnull()] = 1
#print df[df.EmbarkedCode.isnull()][['PassengerId']]
#print df.head(1)
#df = df.reindex(columns = pandas.Index(['Survived']).append(df.columns - ['Survived']))
df = df[['Survived', 'PassengerId', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Gender', 'AgeFill', 'EmbarkedCode', 'Salutation']]
print df.head(2)
cols = df.columns.tolist()


train_data = df.values
print "Train data sample"
#print train_data[0::,1::]
#print train_data[0::,0]



# Start with Random Forest implementation

from sklearn.ensemble import RandomForestClassifier

# Initiate the random forest object

forest = RandomForestClassifier(n_estimators = 50)


# Fit the model
print "Fit the model"
forest = forest.fit(train_data[0::,1::],train_data[0::,0])

#Import the test data and run on the forest
testdata = pd.read_csv("C:\\test.csv",header = 0)

# Extracting the salutation from Name

Salutation = ['Rev.', 'Mrs.', 'Miss.', 'Master.', 'Mr.']
#Salutation = ['Rev.']

testdata['Salutation'] = 100


pattern = ''

for i in xrange(0,len(Salutation)):
pattern = Salutation[i]
print "pattern is",pattern
# print df[df['Name'].str.contains(pattern)]['Salutation'].replace(False, i)
#df['Salutation']= df['Name'].str.contains(pattern).replace('False', i)
testdata.loc[(testdata['Name'].str.contains(pattern)), 'Salutation'] = i
#print pattern, df['Salutation']
print testdata.head(2)

print testdata[(testdata.Salutation != 100)][['Name', 'Salutation']]

for i in range(1,4):
print "Class wise median Fare", i, testdata[(testdata.Pclass == i) & (testdata.Embarked == 'S')]['Fare'].dropna().median()

testdata['Gender'] = testdata.Sex.map({'female':0 , 'male' : 1}). astype(int)

median_ages = np.zeros((2,3))

for i in range(0,2):
for j in range(0,3):
median_ages[i,j] = testdata[(testdata.Gender == i) & (testdata.Pclass == j+1)]['Age'].dropna().median()

#print median_ages

testdata['AgeFill'] = testdata['Age']
#print testdata[testdata.Age.isnull()][['Age', 'AgeFill', 'Pclass', 'Sex']]

for i in range(0,2):
for j in range(0,3):
testdata.loc[(testdata.Age.isnull()) & (testdata.Gender == i) & (testdata.Pclass == j+1) , 'AgeFill'] = median_ages[i,j]

testdata['EmbarkedCode'] = testdata.Embarked.map({'C':0, 'S' : 1, 'Q':2})

testdata = testdata.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age'], axis = 1)


testdata.loc[(testdata.EmbarkedCode.isnull()), 'EmbarkedCode'] = 1
testdata.loc[(testdata.Fare.isnull()), 'Fare'] = 8.05

test_data = testdata.values[0:]

output = forest.predict(test_data)
print forest
#print output[12]
#print test_data[152,0]

Final_output = np.zeros((418,2))

for i in range(0,418):
j = 0
str = test_data[i,0]
str.flatten()

Final_output[i,j] = str
for j in range (1,2):

output1 = output[i]
output1.flatten()

Final_output[i,j] = output1


np.savetxt("C:\\test_rf.csv",Final_output,delimiter = ",")



Please let us know if you want to discuss more about any advanced analytics problem.













No comments:

Post a Comment

Please share your thoughts and let us know the topics you want covered