Pandas5
In [15]:
Copied!
import pandas as pd
import pandas as pd
DataFrame from a list of lists
In [16]:
Copied!
# Row-oriented input: every inner list is one record, given in column order.
data = [
    ['tom', 10],
    ['nick', 15],
    ['juli', 20],
]
df = pd.DataFrame(data=data, columns=['Name', 'Age'])
df  # last expression of the cell -> rendered as the cell output
Out[16]:
Name | Age | |
---|---|---|
0 | tom | 10 |
1 | nick | 15 |
2 | juli | 20 |
In [17]:
Copied!
# Dimensions of the frame as a (n_rows, n_columns) tuple.
df.shape
Out[17]:
(3, 2)
In [25]:
Copied!
# Row labels of the frame.
df.index
Out[25]:
RangeIndex(start=0, stop=3, step=1)
In [26]:
Copied!
# Column labels of the frame.
df.columns
Out[26]:
Index(['Name', 'Age'], dtype='object')
In [27]:
Copied!
# Prints a structural summary: index range, per-column non-null counts and
# dtypes, and the memory footprint. The text is printed; nothing is returned.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Name    3 non-null      object
 1   Age     3 non-null      int64
dtypes: int64(1), object(1)
memory usage: 176.0+ bytes
In [28]:
Copied!
# Descriptive statistics (count, mean, std, min, quartiles, max).
# Only the numeric column 'Age' shows up in the result for this frame.
df.describe()
Out[28]:
Age | |
---|---|
count | 3.0 |
mean | 15.0 |
std | 5.0 |
min | 10.0 |
25% | 12.5 |
50% | 15.0 |
75% | 17.5 |
max | 20.0 |
DataFrame from a dictionary of lists
In [31]:
Copied!
# Column-oriented input: each key is a column label and each list holds
# that column's values, one entry per row.
data = dict(
    Name=['tom', 'nick', 'juli'],
    Age=[10, 15, 20],
)
df = pd.DataFrame(data)
df
Out[31]:
Name | Age | |
---|---|---|
0 | tom | 10 |
1 | nick | 15 |
2 | juli | 20 |
DataFrame from a list of dictionaries
In [32]:
Copied!
# Record-oriented input: one dict per row; the dict keys become the column labels.
data = [
    dict(Name='tom', Age=10),
    dict(Name='nick', Age=15),
    dict(Name='Juli', Age=20),
]
df = pd.DataFrame(data)
df
Out[32]:
Name | Age | |
---|---|---|
0 | tom | 10 |
1 | nick | 15 |
2 | Juli | 20 |
DataFrame built with the zip() function
In [34]:
Copied!
name = ['tom', 'nick', 'juli']
age = [10, 15, 20]

# zip() pairs the i-th name with the i-th age, giving one (Name, Age) tuple per row.
df = pd.DataFrame([*zip(name, age)], columns=['Name', 'Age'])
df
Out[34]:
Name | Age | |
---|---|---|
0 | tom | 10 |
1 | nick | 15 |
2 | juli | 20 |
DataFrame manipulation
In [35]:
Copied!
# Single brackets with one label select that column as a Series.
df['Name']
Out[35]:
0     tom
1    nick
2    juli
Name: Name, dtype: object
In [36]:
Copied!
# Confirms that single-bracket column selection yields a pandas Series.
type(df['Name'])
Out[36]:
pandas.core.series.Series
In [37]:
Copied!
# Double brackets (a list of labels) keep the result two-dimensional:
# a DataFrame with the single column 'Name'.
df[['Name']]
Out[37]:
Name | |
---|---|
0 | tom |
1 | nick |
2 | juli |
In [38]:
Copied!
# Confirms that list-of-labels selection yields a DataFrame, not a Series.
type(df[['Name']])
Out[38]:
pandas.core.frame.DataFrame
In [39]:
Copied!
# Largest value in the 'Age' column.
df['Age'].max()
Out[39]:
20
In [40]:
Copied!
# Boolean-mask row filter: keep only the rows whose Age is strictly above 10.
df.loc[df['Age'] > 10]
Out[40]:
Name | Age | |
---|---|---|
1 | nick | 15 |
2 | juli | 20 |
In [42]:
Copied!
# Rows with Age above 10, restricted to the 'Name' column. The row mask and
# the column list go into one .loc call; the list keeps the result a DataFrame.
df.loc[df['Age'] > 10, ['Name']]
Out[42]:
Name | |
---|---|
1 | nick |
2 | juli |
In [46]:
Copied!
# Keep the rows whose Name contains the substring 'nic'.
df.loc[df['Name'].str.contains('nic')]
Out[46]:
Name | Age | |
---|---|---|
1 | nick | 15 |
In [52]:
Copied!
# `~` inverts the boolean mask: keep every row whose Name does NOT contain 'nick'.
df1 = df.loc[~df['Name'].str.contains('nick')]
df1
Out[52]:
Name | Age | |
---|---|---|
0 | tom | 10 |
2 | juli | 20 |
In [54]:
Copied!
# Exclude rows by exact membership: drop every row whose Name is one of the listed values.
df1 = df.loc[~df['Name'].isin(['nick', 'tom'])]
df1
Out[54]:
Name | Age | |
---|---|---|
2 | juli | 20 |
In [55]:
Copied!
# regex=False makes str.contains treat the pattern as a plain substring, so the
# special characters in it need no escaping. No Name contains this text, so the
# inverted mask keeps every row.
df.loc[~df['Name'].str.contains("#@'£$%abc?", regex=False)]
Out[55]:
Name | Age | |
---|---|---|
0 | tom | 10 |
1 | nick | 15 |
2 | juli | 20 |
In [57]:
Copied!
# Derive a new column 'tmp' from 'Name' with every 'o' doubled ('tom' -> 'toom').
# regex=False pins this to a literal substring replacement; the default of the
# `regex` parameter has differed between pandas versions, so it is stated explicitly.
df['tmp'] = df['Name'].str.replace('o', 'oo', regex=False)
df
Out[57]:
Name | Age | tmp | |
---|---|---|---|
0 | tom | 10 | toom |
1 | nick | 15 | nick |
2 | juli | 20 | juli |
In [58]:
Copied!
# Keep only names longer than the word 'hello' (5 characters), then renumber
# the surviving rows from 0; drop=True discards the old index instead of
# turning it into a column. None of the sample names qualify, so df1 is empty.
df1 = df[df['Name'].str.len() > len('hello')].reset_index(drop=True)
df1
Out[58]:
Name | Age | tmp |
---|
In [4]:
Copied!
# Drop the rows whose Name is missing (NaN/None); all other rows are kept.
df1 = df.loc[df['Name'].notna()]
df1
Out[4]:
Name | Age | |
---|---|---|
0 | tom | 10 |
1 | nick | 15 |
2 | juli | 20 |
In [7]:
Copied!
import numpy as np
import numpy as np
In [10]:
Copied!
# Cast the 'Age' column to integer dtype, replacing the column in place.
df['Age'] = df['Age'].astype(int)
In [14]:
Copied!
# With axis=1 the lambda receives one row at a time and `row.name` is that
# row's index label, so only the row labelled 'd' is squared element-wise;
# every other row is passed through unchanged.
df = df.apply(
    lambda row: np.square(row) if row.name == 'd' else row,
    axis=1,
)
In [ ]:
Copied!
# Template: call a two-argument function once per row (axis=1), feeding it the
# row's 'col1' and 'col2' values. `func` is not defined in the visible part of
# this notebook and must be supplied before the cell is run.
df.apply(lambda row: func(row['col1'], row['col2']), axis=1)
In [ ]:
Copied!
# Column 'c' = the larger of len(a) and len(b), computed per row.
# Vectorised through the .str accessor instead of a Python-level row-wise
# apply; this also works on an empty frame. A missing value in 'a' or 'b'
# yields NaN for that row rather than raising TypeError from len().
df['c'] = np.maximum(df['a'].str.len(), df['b'].str.len())
In [ ]:
Copied!
# Copy column 'a' into 'new_col', but only on the rows where 'a' contains a
# slash; the assignment aligns on the index, so other rows are left untouched.
# regex=False: '/' is matched as a literal substring.
# na=False: missing values count as "no match", which keeps the mask strictly
# boolean (a mask containing NaN cannot be used for .loc selection).
df.loc[df['a'].str.contains('/', regex=False, na=False), 'new_col'] = df['a']
In [ ]:
Copied!
# For the rows where 'a' equals 'Unknown', fill 'new_col' with the fifth
# '/'-separated segment (index 4) of column 'b'.
# The right-hand side is computed for the whole column and aligned on the index,
# so only the masked rows receive a value. The .str accessor returns NaN for
# values with fewer than five segments (or missing values) instead of raising.
df.loc[df['a'] == 'Unknown', 'new_col'] = df['b'].str.split('/').str[4]
In [ ]:
Copied!
# Convert both columns to text element-wise, then concatenate them row by row.
df['a'] = (
    df['b'].map(str)
    + df['c'].map(str)
)
In [ ]:
Copied!
# Vectorised three-way comparison of columns 'a' and 'b':
#   0 where a == b, 1 where a > b, -1 in every remaining case.
df['c'] = np.where(
    df['a'] == df['b'],
    0,
    np.where(df['a'] > df['b'], 1, -1),
)
In [ ]:
Copied!
def f(row):
    """Three-way comparison of a row's 'a' and 'b' values.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys 'a' and 'b' (e.g. a DataFrame row).

    Returns
    -------
    int
        0 when the two values are equal, 1 when 'a' is greater than 'b',
        and -1 in every other case.
    """
    if row['a'] == row['b']:
        return 0
    if row['a'] > row['b']:
        return 1
    return -1
# Evaluate f once per row (axis=1) and store the results in column 'c'.
df['c'] = df.apply(f, axis=1)
In [ ]:
Copied!
# Keep the rows where a >= 1 AND b <= 1. Each comparison needs its own
# parentheses because `&` binds tighter than the comparison operators.
df[(df['a'] >= 1) & (df['b'] <= 1)]
In [ ]:
Copied!
# Boolean flag column: True where the value of column 'b' is one of the
# entries of list1, False otherwise. isin() performs the membership test
# directly, so no row-wise lambda or np.where(..., True, False) is needed.
df['a'] = df['b'].isin(list1)
In [ ]:
Copied!
# 'user2' falls back to the fifth '/'-separated segment (index 4) of 'path'
# whenever 'user' is the placeholder 'Unknown'; otherwise it copies 'user'.
# np.where evaluates both branches for every row, so the path segment is
# extracted with the .str accessor: rows with a short or missing path give NaN
# instead of raising, even when their 'user' value is already known.
df['user2'] = np.where(
    df['user'] == 'Unknown',
    df['path'].str.split('/').str[4],
    df['user'],
)
In [ ]:
Copied!
# Rows ordered by 'date', largest value first. Returns a new frame; df itself is unchanged.
df.sort_values(by='date', ascending=False)
In [ ]:
Copied!
# Per-(a, b, c) summary: latest date, total size in MB and number of files,
# ordered from the largest total size to the smallest.
(
    df.groupby(['a', 'b', 'c'])
    .agg({'latest_date': 'max', 'size_in_mb': 'sum', 'path': 'count'})
    .rename(columns={'path': 'file_count'})  # the count of 'path' is a file count
    .reset_index()  # turn the group keys a, b, c back into ordinary columns
    .sort_values('size_in_mb', ascending=False)
    .reset_index(drop=True)  # renumber rows 0..n-1 after sorting
)
In [ ]:
Copied!
In [43]:
Copied!
# Show up to 1000 characters per cell before pandas truncates the text.
# The full option key is used; abbreviated keys are matched by pattern and can
# become ambiguous if a later pandas version adds a similarly named option.
pd.set_option('display.max_colwidth', 1000)
In [5]:
Copied!
# Turn off the chained-assignment check for the whole session (default: 'warn').
# NOTE(review): this hides the warning everywhere, including places where it
# would point at a real bug - confirm that a global switch is intended.
pd.set_option('mode.chained_assignment', None)
In [ ]:
Copied!
# Load 'student.csv' (path relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='student.csv')
In [ ]:
Copied!
# Read a header-less CSV while tolerating malformed lines.
#   header=None          - the file has no header row; columns are numbered
#   index_col=False      - never use the first column as the index
#   on_bad_lines='warn'  - skip malformed lines and emit a warning for each
#                          (replaces warn_bad_lines=True / error_bad_lines=False,
#                          which no longer exist in pandas 2.x; needs pandas >= 1.3)
#   skiprows             - ignore the 0-based line numbers 100..199
df = pd.read_csv(
    'path/abc.csv',
    header=None,
    delimiter=',',
    index_col=False,
    on_bad_lines='warn',
    skiprows=range(100, 200),
)
In [ ]:
Copied!
# Write the frame to 'student.csv'; index=False leaves the row labels out of the file.
df.to_csv(path_or_buf='student.csv', index=False)
In [ ]:
Copied!
In [ ]:
Copied!
# Position of the last '/' inside the part of lst_path that ends one character
# before key_word starts (i.e. the slash directly in front of key_word is
# excluded from the search).
# str.find returns -1 when key_word is absent and 0 when the path starts with
# it; in both cases the slice bound would go negative and the search would run
# over the wrong text, so start_index is set to -1 ("not found") instead.
key_pos = lst_path.find(key_word)
start_index = lst_path[:key_pos - 1].rfind('/') if key_pos > 0 else -1
In [ ]:
Copied!
# Decode every byte string as UTF-8; errors='ignore' silently drops any bytes
# that are not valid UTF-8 instead of raising UnicodeDecodeError.
lst_string = [
    raw.decode(encoding='utf-8', errors='ignore')
    for raw in list_of_byte_string
]
In [ ]:
Copied!