A crash course on Python II – Read and analyse a csv file using Python classes

Create a class

class Employee:
    """Common base class for all employees"""

    # Class-level counter shared by every instance; bumped once per construction.
    empCount = 0

    def __init__(self, name, salary):
        """Store the employee's name and salary and increment the shared count."""
        self.name = name
        self.salary = salary
        Employee.empCount += 1

    def displayCount(self):
        """Print the number of Employee objects created so far."""
        # A single parenthesised argument works both as a Python 2 print
        # statement and as a Python 3 print() call.
        print("Total Employee %d" % Employee.empCount)

    def displayEmployee(self):
        """Print this employee's name and salary on one line."""
        # Pre-formatting into one string reproduces the spacing the Python 2
        # comma-separated print statement emitted, on either Python version.
        print("Name :  %s , Salary:  %s" % (self.name, self.salary))

Create and use objects

# Create two employees, show each one, then show the shared instance count.
emp1 = Employee("Zara", 2000)
emp2 = Employee("Manni", 5000)
emp1.displayEmployee()
emp2.displayEmployee()
# Parenthesised single argument: valid on both Python 2 and Python 3.
print("Total Employee %d" % Employee.empCount)
'''
Output
Name :  Zara , Salary:  2000
Name :  Manni , Salary:  5000
Total Employee 2
'''

Complete class/object example –

#!/usr/bin/python

class Employee:
    """Common base class for all employees"""

    # Class-level counter shared by every instance; bumped once per construction.
    empCount = 0

    def __init__(self, name, salary):
        """Store the employee's name and salary and increment the shared count."""
        self.name = name
        self.salary = salary
        Employee.empCount += 1

    def displayCount(self):
        """Print the number of Employee objects created so far."""
        # A single parenthesised argument works both as a Python 2 print
        # statement and as a Python 3 print() call.
        print("Total Employee %d" % Employee.empCount)

    def displayEmployee(self):
        """Print this employee's name and salary on one line."""
        # Pre-formatting into one string reproduces the spacing the Python 2
        # comma-separated print statement emitted, on either Python version.
        print("Name :  %s , Salary:  %s" % (self.name, self.salary))

# Create two employees, show each one, then show the shared instance count.
emp1 = Employee("Zara", 2000)
emp2 = Employee("Manni", 5000)
emp1.displayEmployee()
emp2.displayEmployee()
# Parenthesised single argument: valid on both Python 2 and Python 3.
print("Total Employee %d" % Employee.empCount)

We are going to create a Python class to read and analyze this CSV file. A few rows of the file are shown below (the columns appear separated by spaces here; the class further down splits each row on commas) –

school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason guardian traveltime studytime failures schoolsup famsup paid activities nursery higher internet romantic famrel freetime goout Dalc Walc health absences G1 G2 G3
GP F 18 U GT3 A 4 4 at_home teacher course mother 2 2 0 yes no no no yes yes no no 4 3 4 1 1 3 6 5 6 6
GP F 17 U GT3 T 1 1 at_home other course father 1 2 0 no yes no no no yes yes no 5 3 3 1 1 3 4 5 5 6
GP F 15 U LE3 T 1 1 at_home other other mother 1 2 3 yes no yes no yes yes yes no 4 3 2 2 3 3 10 7 8 10
GP F 15 U GT3 T 4 2 health services home mother 1 3 0 no yes yes yes yes yes yes yes 3 2 2 1 1 5 2 15 14 15
GP F 16 U GT3 T 3 3 other other home father 1 2 0 no yes yes no yes yes no no 4 3 2 1 2 5 4 6 10 10
GP M 16 U LE3 T 4 3 services other reputation mother 1 2 0 no yes yes yes yes yes yes no 5 4 2 1 2 5 10 15 15 15
GP M 16 U LE3 T 2 2 other other home mother 1 2 0 no no no no yes yes yes no 4 4 4 1 1 3 0 12 12 11

The only analysis we are going to do in this tutorial is finding the mean, median, mode, and standard deviation of an integer-valued column of the above file.

We need a class with the following methods –

  1. Read the file
  2. Pre-process the file
  3. Find
    1. Mean
    2. Median
    3. Mode
    4. Standard Deviation

A template of our required class is given below

class DACourse:
    """Skeleton of the data-analysis class: each method only announces itself."""

    def __init__(self, ip_file_name):
        """Initialise the class.

        ip_file_name -- name of the input file; this template only echoes it.
        """
        # Parenthesised single argument: valid on both Python 2 and Python 3.
        print("Initialize class to object; opening file - " + ip_file_name)

    def preprocess(self):
        """Pre-process the file (placeholder)."""
        print("Pre-process")

    def findMean(self):
        """Find mean (placeholder)."""
        print("Find-mean")

    def findMedian(self):
        """Find median (placeholder)."""
        print("Find-median")

    def findMode(self):
        """Find mode (placeholder)."""
        print("Find-mode")

    def findSD(self):
        """Find standard deviation (placeholder)."""
        print("Find-sd")

You can create an object of the above class and call its methods from an interactive Python session (this assumes the class is saved in a file named DACourse.py) –

$ python
> from DACourse import *
> obj = DACourse("myfile")
Initialize class to object; opening file - myfile
> obj.preprocess()
Pre-process
> obj.findMean()
Find-mean
> obj.findMedian()
Find-median
> obj.findMode()
Find-mode
> obj.findSD()
Find-sd

The complete class is given below –

import math
class DACourse:
    """Load one integer column of a comma-separated file and summarise it.

    Typical use: construct with a file name, call ``preprocess(column)`` once,
    then call any of ``findMean``, ``findMedian``, ``findMode`` or ``findSD``.
    """

    def __init__(self, ip_file_name):
        """Remember the input file name; the file is only opened by preprocess()."""
        self.ip_file_name = ip_file_name
        # Start empty so the statistics methods can report "no data" clearly
        # instead of failing with AttributeError before preprocess() is called.
        self.data_array = []

    def preprocess(self, ip_column):
        """Read column ``ip_column`` (0-based) of the file into ``self.data_array``.

        The first line is treated as a header and skipped. Every other
        non-blank line is split on commas and the selected field is converted
        with ``int()``.

        Raises:
            IndexError: a row has fewer than ``ip_column + 1`` fields.
            ValueError: the selected field is not an integer.
        """
        values = []
        # ``with`` closes the handle even when a row fails to parse.
        with open(self.ip_file_name) as handle:
            for line_number, line in enumerate(handle):
                if line_number == 0:
                    continue  # header row
                row = line.rstrip('\n\r')
                if not row.strip():
                    continue  # tolerate blank lines, e.g. a trailing newline
                values.append(int(row.split(",")[ip_column]))
        self.data_array = values

    def _require_data(self, minimum=1):
        """Raise ValueError unless at least ``minimum`` values are loaded."""
        if len(self.data_array) < minimum:
            raise ValueError(
                "need at least %d value(s); call preprocess() on a file with data first"
                % minimum
            )

    def findMean(self):
        """Return the arithmetic mean of the loaded values as a float.

        Raises ValueError if no values are loaded.
        """
        self._require_data()
        return float(sum(self.data_array)) / len(self.data_array)

    def findMedian(self):
        """Return the median of the loaded values.

        For an even count the result is the float average of the two middle
        values; for an odd count it is the middle value itself. The stored
        data is not reordered.

        Raises ValueError if no values are loaded.
        """
        self._require_data()
        ordered = sorted(self.data_array)
        # Floor division keeps the index an int on Python 3 as well as Python 2.
        middle = len(ordered) // 2
        if len(ordered) % 2 == 0:
            return float(ordered[middle - 1] + ordered[middle]) / 2
        return ordered[middle]

    def findMode(self):
        """Return the most frequent value; ties resolve to the smallest value.

        Raises ValueError if no values are loaded.
        """
        self._require_data()
        # Counting in a dict handles negative and very large values, which a
        # list indexed by value cannot.
        counts = {}
        for value in self.data_array:
            counts[value] = counts.get(value, 0) + 1
        highest = max(counts.values())
        return min(value for value, count in counts.items() if count == highest)

    def findSD(self):
        """Return the sample standard deviation (n - 1 denominator).

        Raises ValueError if fewer than two values are loaded, because the
        sample variance is undefined in that case.
        """
        self._require_data(minimum=2)
        mean = self.findMean()
        squared_deviations = sum((value - mean) ** 2 for value in self.data_array)
        variance = squared_deviations / (len(self.data_array) - 1)
        return math.sqrt(variance)