In [1]:
1
2
3
4
import pymysql, argparse, os, sys, json
from sqlalchemy import create_engine
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
1
2
3
4
5
6
7
8
9
10
11
parser = argparse.ArgumentParser()
parser.add_argument('-user', help="mysql database user", type=str, required=False, default='****')
parser.add_argument('-pw', help="password", type=str, required=False, default='****')
parser.add_argument('-host', help="ip address", type=str, required=False, default='***.***.***.***')
parser.add_argument('-db', help="database name", type=str, required=False, default='classicmodels')
parser.add_argument('-charset', help="character set to use", type=str, required=False, default='utf8mb4')
sys.argv = ['-f']
args = parser.parse_args()
print(args)
con = pymysql.connect(host=args.host, user=args.user, password=args.pw, use_unicode=True, charset=args.charset)
cursor = con.cursor()
1
2
Namespace(charset='utf8mb4', db='classicmodels', host='***.***.***.***', pw='****', user='****')

In [3]:
1
2
3
4
5
6
7
8
9
10
11
# helper functions 
sql = lambda command: pd.read_sql(command, con)
def fetch(command):
    cursor.execute(command)
    return cursor.fetchall()

# db_data = 'mysql+pymysql://' + '<USER-NAME>' + ':' + '<PASSWORD>' + '@' + '***.***.***.***' + ':3306/' + '<DB-NAME>' + '?charset=utf8mb4'
db_data = "mysql+pymysql://{}:{}@{}:3306/{}?charset={}".format(args.user, args.pw, args.host, args.db, args.charset)
engine = create_engine(db_data).connect()

fetch("use classicmodels")
1
()

Pandas Tutorial

Prerequisite

  1. How to use pymysql and mysql
  2. How to use argparse module in jupyter notebook

I use sample database classicmodels on MySQL in this tutorial

In [4]:
1
sql("show tables")
Tables_in_classicmodels
0 customers
1 employees
2 music
3 offices
4 orderdetails
5 orders
6 payments
7 productlines
8 products
In [5]:
1
df = sql("select * from customers")
In [6]:
1
df.info()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   customerNumber          122 non-null    int64  
 1   customerName            122 non-null    object 
 2   contactLastName         122 non-null    object 
 3   contactFirstName        122 non-null    object 
 4   phone                   122 non-null    object 
 5   addressLine1            122 non-null    object 
 6   addressLine2            22 non-null     object 
 7   city                    122 non-null    object 
 8   state                   49 non-null     object 
 9   postalCode              115 non-null    object 
 10  country                 122 non-null    object 
 11  salesRepEmployeeNumber  100 non-null    float64
 12  creditLimit             122 non-null    float64
dtypes: float64(2), int64(1), object(10)
memory usage: 12.5+ KB

Indexing and selecting

In [7]:
1
df.head(n=1)
customerNumber customerName contactLastName contactFirstName phone addressLine1 addressLine2 city state postalCode country salesRepEmployeeNumber creditLimit
0 103 Atelier graphique Schmitt Carine 40.32.2555 54, rue Royale None Nantes None 44000 France 1370.0 21000.0
In [8]:
1
df.iloc[0]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
customerNumber                          103
customerName              Atelier graphique
contactLastName                     Schmitt
contactFirstName                    Carine 
phone                            40.32.2555
addressLine1                 54, rue Royale
addressLine2                           None
city                                 Nantes
state                                  None
postalCode                            44000
country                              France
salesRepEmployeeNumber                 1370
creditLimit                           21000
Name: 0, dtype: object
In [9]:
1
df.sort_values(by='creditLimit')[:3]
customerNumber customerName contactLastName contactFirstName phone addressLine1 addressLine2 city state postalCode country salesRepEmployeeNumber creditLimit
74 335 Cramer Spezialitäten, Ltd Cramer Philip 0555-09555 Maubelstr. 90 None Brandenburg None 14776 Germany NaN 0.0
41 223 Natürlich Autos Kloss Horst 0372-555188 Taucherstraße 10 None Cunewalde None 01307 Germany NaN 0.0
44 237 ANG Resellers Camino Alejandra (91) 745 6555 Gran Vía, 1 None Madrid None 28001 Spain NaN 0.0
In [10]:
1
df.sort_values(by='creditLimit')[['customerNumber', 'phone', 'city', 'creditLimit']]
customerNumber phone city creditLimit
74 335 0555-09555 Brandenburg 0.0
41 223 0372-555188 Cunewalde 0.0
44 237 (91) 745 6555 Madrid 0.0
48 247 069-0555984 Frankfurt 0.0
109 465 +34 913 728555 Madrid 0.0
... ... ... ... ...
29 187 (171) 555-1555 Manchester 136800.0
15 151 2125557413 NYC 138500.0
60 298 0897-034555 Genève 141300.0
5 124 4155551450 San Rafael 210500.0
10 141 (91) 555 94 44 Madrid 227600.0

122 rows × 4 columns

Merge and Join

In [11]:
1
df2 = sql("select * from orders")
In [12]:
1
df2.head(n=2)
orderNumber orderDate requiredDate shippedDate status comments customerNumber
0 10100 2003-01-06 2003-01-13 2003-01-10 Shipped None 363
1 10101 2003-01-09 2003-01-18 2003-01-11 Shipped Check on availability. 128
In [13]:
1
len(pd.merge(left=df, right=df2))
1
326
In [14]:
1
2
# sanity check through sql.
sql("select count(*) from customers c, orders o where c.customerNumber = o.customerNumber")
count(*)
0 326

Pivot table

In [15]:
1
2
3
4
5
6
df = sql("select * from music")
df.head(n=2)
df = df.pivot(index='user_id', columns='item_id', values='ratings')
num_user, num_items = df.shape
print("# of users: {:,}, # of items: {:,}".format(num_user, num_items))
df.head(n=3)
1
2
# of users: 1,429, # of items: 900

item_id 0 1 2 3 4 5 6 7 8 9 ... 890 891 892 893 894 895 896 897 898 899
user_id
0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

3 rows × 900 columns

In [16]:
1
"density rate: {:.3f}%".format((1 - (df.isna().sum(axis=0).sum() / (num_user * num_items))) * 100)
1
'density rate: 0.798%'

Visualization

I would like to know order statistics for people with a large credit limit group by city of residence in the customer table.

In [17]:
1
df = sql("select * from customers")
In [18]:
1
df.head(n=1)
customerNumber customerName contactLastName contactFirstName phone addressLine1 addressLine2 city state postalCode country salesRepEmployeeNumber creditLimit
0 103 Atelier graphique Schmitt Carine 40.32.2555 54, rue Royale None Nantes None 44000 France 1370.0 21000.0
In [19]:
1
sql("select count(distinct city) from customers where creditLimit > 1000")
count(distinct city)
0 77
In [20]:
1
2
3
4
5
6
res = sql(
    """ select city, avg(creditLimit) as creditLimit
        from customers 
        where creditLimit > 0 
        group by city 
        order by creditLimit""")
In [21]:
1
res.head(n=3)
city creditLimit
0 Charleroi 23500.0
1 Glendale 30350.0
2 Milan 34800.0

To simplify this problem, let’s pick top 10 countries.

In [22]:
1
res = res.nlargest(n=10, columns='creditLimit')
In [23]:
1
2
plt.figure()
res.plot(x='city', y='creditLimit', kind='bar')
1
<matplotlib.axes._subplots.AxesSubplot at 0x7f03a54a8810>
1
<Figure size 432x288 with 0 Axes>

png


ETC

Apply function to row-wise(axis=1) or column-wise(axis=0)

In [24]:
1
2
3
4
def func(x):
    import pdb; pdb.set_trace()
    return  x
df.apply(lambda x: x['city'], axis=1)
1
2
3
4
5
6
7
8
9
10
11
12
0            Nantes
1         Las Vegas
2         Melbourne
3            Nantes
4           Stavern
           ...     
117    Philadelphia
118        Brisbane
119          London
120          Boston
121      Auckland  
Length: 122, dtype: object

Export pandas.DataFrame to csv file.

In [25]:
1
df.to_csv('./customers.csv')

Options

Display options

In [26]:
1
2
3
4
5
# display options 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
# pd.reset_option('^display') # reset options of starting with 'display'.

Referenece

[1]Offical pandas Guide
[2]MySQL sample database description

Tags:

Categories:

Updated:

Leave a comment