Python is a wonderful programming language. The more I explore it, the more I am amazed by the power of this language.
Below is a recent random forest implementation using the pandas and scikit-learn libraries.
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is Random forest implementation.
"""
import numpy as np
import csv as csv
import scipy as sp
import pandas as pandas
import matplotlib as mpl
import re
print("import successful")

# Read the raw training CSV with the csv module: `header` holds the first
# row, `data` every remaining row as a 2-D array of strings.
# The `with` block guarantees the file handle is closed after reading.
with open("C:\\train.csv") as train_file:
    csv_object = csv.reader(train_file, delimiter=",")
    # Skip the header row (next() works on both Python 2 and Python 3).
    header = next(csv_object)
    data = list(csv_object)

# Number of data rows, excluding the header.
count = len(data)
data = np.array(data)
# Feature engineering for the train/test CSV files and random forest fitting.
from sklearn.ensemble import RandomForestClassifier

# Titles searched for in the 'Name' column; a title's index is its code.
Salutation = ['Rev.', 'Mrs.', 'Miss.', 'Master.', 'Mr.']

# Model input columns, in the exact order used for BOTH fitting and
# prediction. The forest matches features by position, so the train and
# test matrices must share this order.
_FEATURE_COLUMNS = ['PassengerId', 'Pclass', 'SibSp', 'Parch', 'Fare',
                    'Gender', 'AgeFill', 'EmbarkedCode', 'Salutation']


def _encode_salutation(frame):
    """Add a 'Salutation' column holding the code of the title in 'Name'.

    Rows whose name contains none of the titles in ``Salutation`` keep the
    placeholder value 100. The frame is modified in place.
    """
    frame['Salutation'] = 100
    for code, title in enumerate(Salutation):
        print("pattern is %s" % title)
        # regex=False: the titles end in '.', which as a regular expression
        # matches any character, so 'Mr.' would also match 'Mrs' and
        # overwrite its code. na=False treats missing names as "no match".
        has_title = frame['Name'].str.contains(title, regex=False, na=False)
        frame.loc[has_title, 'Salutation'] = code
    print(frame.head(2))
    print(frame[frame.Salutation != 100][['Name', 'Salutation']])


def _fill_missing_ages(frame):
    """Add an 'AgeFill' column: 'Age' with gaps replaced by group medians.

    Groups are the six (Gender, Pclass) combinations, with the medians
    computed from ``frame`` itself. The frame is modified in place.

    Returns the 2x3 array of medians, indexed [Gender, Pclass - 1].
    """
    medians = np.zeros((2, 3))
    frame['AgeFill'] = frame['Age']
    age_missing = frame.Age.isnull()
    for gender in range(2):
        for pclass in range(1, 4):
            in_group = (frame.Gender == gender) & (frame.Pclass == pclass)
            group_median = frame.loc[in_group, 'Age'].dropna().median()
            medians[gender, pclass - 1] = group_median
            frame.loc[in_group & age_missing, 'AgeFill'] = group_median
    return medians


def _add_features(frame):
    """Add Salutation, Gender, AgeFill and EmbarkedCode columns in place.

    Returns the (Gender, Pclass) median-age table used to fill 'AgeFill'.
    """
    _encode_salutation(frame)
    frame['Gender'] = frame.Sex.map({'female': 0, 'male': 1}).astype(int)
    medians = _fill_missing_ages(frame)
    frame['EmbarkedCode'] = frame.Embarked.map({'C': 0, 'S': 1, 'Q': 2})
    # Fill only the EmbarkedCode cell; assigning to the whole row would
    # overwrite every other column (including the label) with 1.
    frame.loc[frame.EmbarkedCode.isnull(), 'EmbarkedCode'] = 1
    return medians


# ---- Training data ----
# The file imports pandas under the name `pandas`, so use that name here.
df = pandas.read_csv("C:\\train.csv", header=0)
median_ages = _add_features(df)
# Keep the label first, then the features; this also discards the raw text
# columns (Name, Sex, Ticket, Cabin, Embarked) and the unfilled Age.
df = df[['Survived'] + _FEATURE_COLUMNS]
print(df.head(2))
cols = df.columns.tolist()
train_data = df.values
print("Train data sample")

# ---- Fit the random forest ----
forest = RandomForestClassifier(n_estimators=50)
print("Fit the model")
# Column 0 is the label ('Survived'); the remaining columns are features.
forest = forest.fit(train_data[:, 1:], train_data[:, 0])

# ---- Test data ----
testdata = pandas.read_csv("C:\\test.csv", header=0)
median_ages = _add_features(testdata)
for pclass in range(1, 4):
    southampton = (testdata.Pclass == pclass) & (testdata.Embarked == 'S')
    class_fare = testdata.loc[southampton, 'Fare'].dropna().median()
    print("Class wise median Fare %s %s" % (pclass, class_fare))
# NOTE(review): 8.05 was presumably read off the medians printed above --
# confirm before reusing this script on other data.
testdata.loc[testdata.Fare.isnull(), 'Fare'] = 8.05
# Select the features in the same order the forest was trained on.
testdata = testdata[_FEATURE_COLUMNS]
test_data = testdata.values
# Predict on the prepared test rows and save (first column, prediction) pairs.
output = forest.predict(test_data)
print(forest)

# Pair the first column of test_data (presumably PassengerId -- confirm
# against the test-data preparation) with its prediction. The array is sized
# from the data itself rather than a fixed row count, so the script keeps
# working when the test file has a different number of rows.
Final_output = np.column_stack((test_data[:, 0], output)).astype(float)
np.savetxt("C:\\test_rf.csv", Final_output, delimiter=",")
Please let us know if you would like to discuss any advanced analytics problem in more detail.