Pandas - DataFrame
In [7]:
#importing pandas and DataFrame
import pandas as pd
from pandas import DataFrame
# Constructor signature: DataFrame(data, index, columns, dtype, copy)
# `data` may be a list, dict, Series, NumPy ndarray or another DataFrame.

# Create an empty data frame.
df = DataFrame()

# Create a data frame from a flat list: one column, one row per element.
df = DataFrame([1, 2, 3, 4, 5, 5])
# print() call instead of the Python 2 print statement (SyntaxError on Python 3).
print(df)
In [8]:
# Create a DataFrame from a list of lists; each inner list becomes one row.
d2 = [[1, 2, 3, 4],
      ['a', 'b', 'c', 'd']]
df = DataFrame(d2)
# print() call instead of the Python 2 print statement (SyntaxError on Python 3).
print(df)
In [9]:
# Create a DataFrame from a dict of equal-length lists (one key per column).
import pandas as pd
data = {'Name': ['Tom', 'Jack', 'Steve', 'Ricky'], 'Age': [28, 34, 29, 42]}
# `columns` fixes the column order; 'a' is not a key of `data`, so that
# column is created and filled with NaN.
df = pd.DataFrame(data, columns=['Age', 'Name', 'a'])
# print() call instead of the Python 2 print statement (SyntaxError on Python 3).
print(df)
In [10]:
# Create a DataFrame from a list of dictionaries: each dict is one row and the
# union of all keys gives the columns; keys missing from a row become NaN.
import pandas as pd
data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
df = pd.DataFrame(data)
# print() call instead of the Python 2 print statement (SyntaxError on Python 3).
print(df)
In [11]:
# Build the frame used by the following cells: columns Age, Name and an
# all-NaN column 'a' (requested in `columns` but absent from `data`).
data = {'Name': ['Tom', 'Jack', 'Steve', 'Ricky'], 'Age': [28, 34, 29, 42]}
df = pd.DataFrame(data, columns=['Age', 'Name', 'a'])
# print() call instead of the Python 2 print statement (SyntaxError on Python 3).
print(df)
In [12]:
# Understand the data: describe() reports summary statistics such as count,
# mean, standard deviation, min/max and percentile limits.
# print() call instead of the Python 2 print statement (SyntaxError on Python 3).
print(df.describe())
In [14]:
# Summarise the data frame: number of columns, data types, memory usage.
# info() writes its report to stdout itself and returns None, so it must not
# be wrapped in print (that would append a stray "None" line).
df.info()
In [15]:
# Column labels of the frame.
print(df.columns)
print("---------")
# All values, as a NumPy array.
print(df.values)
print("---------")
# Row index of the frame.
print(df.index)
In [17]:
# Select a single column from the data frame.
print(df['Name'])
print("----------")
# Select multiple columns by passing a list of labels.
print(df[['Name', 'Age']])
print("-----------")
# Add a new column; the list supplies one value per row.
df['Gender'] = ['M', 'M', 'M', 'M']
print(df)
print("------------")
# Delete a column from the data frame.
del df['Gender']
# df.pop(<label>) also removes a column (and returns it), e.g. df.pop('a')
In [24]:
# Select a single value by integer position: row position 1, column position 1.
# (Both arguments to iloc are positions, not labels.)
df.iloc[1,1]
Out[24]:
In [27]:
# Select a single value by label: the row labelled 1 and the column labelled "Name".
df.loc[1,"Name"]
Out[27]:
In [29]:
# Select a single value by integer position: row position 3, column position 1.
# The original used df.ix[3,1]; the .ix indexer was deprecated in pandas 0.20
# and removed in 1.0. With the default 0..n-1 row index used here, .iloc
# returns the same value.
df.iloc[3, 1]
Out[29]:
In [39]:
# iterrows() yields one (index label, row) tuple per row of the frame.
for label, row in df.iterrows():
    # Check the types of the two tuple members, then stop after the first row.
    print("return type of iterrows ", type(label))
    print("values of tupe are of type ", type(row))
    break

# Walk every row and read individual fields by column label.
for label, row in df.iterrows():
    print("Name is : ", row['Name'])
    print("Age is : ", row['Age'])
In [71]:
# Iterate column-wise: each step yields (column label, column values).
# DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent.
for col_name, col_values in df.items():
    print(col_name)
    print(col_values)
In [78]:
# itertuples() yields each row as a tuple.
for row in df.itertuples():
    print(row)
In [43]:
print(df)
# reindex() returns a new frame conformed to the requested row labels and
# column labels (here: rows 1 and 2, columns Name then Age); df is unchanged.
df1 = df.reindex(index=[1, 2], columns=['Name', 'Age'])
df1
Out[43]:
In [46]:
# Replace the row labels of the data frame in place, one new label per row.
rank_labels = ['rank1', 'rank3', 'rank2', 'rank4']
df.index = rank_labels
df
Out[46]:
In [82]:
import numpy as np
# A 10x2 frame of random values whose row labels are deliberately out of order.
unsorted_df = pd.DataFrame(
    np.random.randn(10, 2),
    index=[1, 4, 6, 2, 3, 5, 9, 8, 0, 7],
    columns=['col2', 'col1'],
)
# Sorting can be done by index label or by column values.
# Sorting based on index (ascending by default):
print(unsorted_df.sort_index())
print("-----------------------------")
# ... and in descending order:
print(unsorted_df.sort_index(ascending=False))
In [86]:
# Sorting based on values.
# sort_values() takes a column name or a list of column names as input.
# Only the last expression of a cell is displayed, so the first result is
# printed explicitly; otherwise it would be computed and silently discarded.
print(unsorted_df.sort_values('col1'))
# Sort by col1 first, using col2 to break ties.
unsorted_df.sort_values(['col1', 'col2'])
Out[86]:
In [87]:
#### Statistical Functions with DataFrame
In [95]:
# Percent change of every value relative to the row 1 period before it.
print(unsorted_df.pct_change(1))
print("---------------")
print("correlation matrix")
# Pairwise correlation between the columns.
print(unsorted_df.corr())
In [100]:
# Rolling functions are window-based: each result is computed over a sliding
# window of consecutive rows. The window used here is 2 rows.
# Restrict to numeric columns first: rolling mean/sum is undefined for text
# columns such as 'Name', and newer pandas no longer silently drops them.
numeric_df = df.select_dtypes(include='number')
print(numeric_df.rolling(window=2).mean())
print(numeric_df.rolling(window=2).sum())


# Applying a custom function to each rolling window of the data frame.
def fun(s):
    """Print the window that was passed in and return its first element.

    Depending on the pandas version, the window arrives as a Series or as a
    NumPy array, so the first element is taken positionally in both cases.
    """
    print(s)
    # s[0] on a Series is a label lookup, which is wrong for a frame whose
    # index is not 0..n-1; .iloc[0] is always the first element.
    return s.iloc[0] if hasattr(s, 'iloc') else s[0]


print(numeric_df.rolling(window=2).agg(fun))
In [105]:
# import the pandas library
import pandas as pd
# NOTE(review): 'kings' (lower case) differs from 'Kings', so it forms its own
# group below; kept as in the original data -- confirm whether it is a typo.
ipl_data = {
    'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
             'kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
    'Rank': [1, 2, 2, 3, 3, 4, 1, 1, 2, 4, 1, 2],
    'Year': [2014, 2015, 2014, 2015, 2014, 2015, 2016, 2017, 2016, 2014, 2015, 2017],
    'Points': [876, 789, 863, 673, 741, 812, 756, 788, 694, 701, 804, 690],
}
df = pd.DataFrame(ipl_data)

# groupby() partitions the rows of the data frame by the values of 'Team';
# .groups maps each group name to the row labels that belong to it.
print(df.groupby('Team').groups)

# Iterating a groupby yields (group name, sub-frame) pairs.
for name, group in df.groupby('Team'):
    print(name)
    print(group)
In [108]:
# Sum every remaining column within each team.
print(df.groupby('Team').sum())
# Sum a single column within each team.
print(df.groupby('Team')['Points'].sum())
# Equivalent, via agg(). The function is named by string: passing np.sum is
# deprecated in recent pandas, and the string form needs no NumPy import.
print(df.groupby('Team')['Points'].agg('sum'))
In [109]:
# Apply several aggregations at once; the result has one column per function.
# String names replace np.sum / np.mean / np.std: passing NumPy callables to
# agg() is deprecated in recent pandas, and the result columns are named
# 'sum', 'mean' and 'std' either way.
print(df.groupby(['Rank'])['Points'].agg(['sum', 'mean', 'std']))
# Same aggregations, grouped by the (Rank, Team) combination.
print(df.groupby(['Rank', 'Team'])['Points'].agg(['sum', 'mean', 'std']))
Comments
Post a Comment