tutorials

The official doc site has some great tutorials: http://pandas.pydata.org/pandas-docs/stable/tutorials.html

import/export data file

csv

# read from csv
df = pd.read_csv('data.csv')
# read csv from http
df = pd.read_csv('http://note.cheng10.cc/files/iris.csv')
# write to csv
df.to_csv('data.csv', index=False)

excel

xls = pd.ExcelFile('data.xlsx')
df = xls.parse('Sheet1')

filter & exclude

df_filtered = df[(df.salary >= 30000) & (df.year == 2017)]
df = df[(df.A == 1) & (df.D == 6)]
df = df[df.line_race != 0]
df = df[df['line race'] != 0]
df.loc[df['column_name'].isin(some_values)]
df.loc[~df['column_name'].isin(some_values)]

# filter df to rows where the unit_id column is numeric
df = df[df.unit_id.apply(lambda x: x.isnumeric())]

merge

df = pd.merge(df_raw, df_pubid, sort=True, how='left', on=['pubid', ])

find rows not in another dataframe

merged = df.merge(other, how='left', indicator=True)  
merged[merged['_merge']=='left_only']

groupby & aggregate

df14_amazon.groupby(['pubid', 'source', 'pkg']).agg({
    'click': 'sum', 'conv': 'sum', 'convf': 'sum', 'revenue': 'sum', 'revenuef': 'sum'
}).reset_index()
df_watch_count = df_user_watch.groupby(['user_id', 'creative_id']).agg({
    "progress": ["max", "count"],
}).reset_index()

slice

df = df.head(10)

sort

df = df.sort_values(['revenue', 'pubid'], ascending=False)

append

df1 = df1.append(df2)  # deprecated in pandas 1.4, removed in 2.0 -- use pd.concat instead
df = pd.concat([df1, df2])

rename

df = df.rename(columns={'old_name': 'new_name'})

selecting columns

df1 = df[['color', 'food', 'score']]

flatten multiindex column

df.columns = df.columns.map('_'.join)

deduplicate

df = df.drop_duplicates(['campaign_id'])
subset : column label or sequence of labels, optional
Only consider certain columns for identifying duplicates, by default use all of the columns

to dict

df = df.reset_index().to_dict('records')
# reset the index after a merge/groupby, to flatten a multi-level index

change data type

df['conv'] = df['conv'].astype('int')

count rows number

# 1. df.shape
# 2. df[0].count()
# 3. len(df.index)
# fastest: len(df.index)

https://stackoverflow.com/questions/15943769/how-do-i-get-the-row-count-of-a-pandas-dataframe

dropna

# 1. drop all na
df = df.dropna()
# 2. drop subset of columns
df = df.dropna(subset=['name', 'born'])

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html

fillna

df = df.fillna(0)
df = df.fillna(value={'A': 0, 'B': ''})

remove infinity

df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(0)

round

df = df.round(2)

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.round.html

sum/concat string with separator

with groupby:

>>> df
   a   b
0  1  bb
1  1  cc
>>> df.groupby(['a'])['b'].apply(",".join).reset_index()
   a      b
0  1  bb,cc

without groupby:

>>> s = pd.Series(['a', 'b', np.nan, 'd'])
>>> s.str.cat(sep=' ')
'a b d'

concat columns

>>> df['e'] = df['b'].str.cat(df['c'], sep=',')
   a   b   c      e
0  1  bb  dd  bb,dd
1  1  cc  ee  cc,ee

isnull

df[df['City'].isnull()]

condition

# Set a default value
df['Age_Group'] = '<40'
# Use .loc instead of chained indexing (df['col'][mask] = ...), which raises
# SettingWithCopyWarning and does not update df under pandas 2.x copy-on-write
# Set Age_Group for all rows where Age is greater than 40
df.loc[df['Age'] > 40, 'Age_Group'] = '>40'
# Set Age_Group for all rows where Age is greater than 18 and less than 40
df.loc[(df['Age'] > 18) & (df['Age'] < 40), 'Age_Group'] = '>18'
# Set Age_Group for all rows where Age is less than 18
df.loc[df['Age'] < 18, 'Age_Group'] = '<18'