In [14]:
1
2
3
import pymysql, argparse, os, sys, json
from sqlalchemy import create_engine
import pandas as pd

How to use pymysql with pandas and MySQL

Step 0. Connect to MySQL

In [2]:
1
2
3
4
5
6
7
8
9
# Command-line-style configuration for the DB connection.
# NOTE(security): defaults are placeholders — pass real credentials via flags
# or environment variables; never commit real passwords/hosts to a notebook.
parser = argparse.ArgumentParser()
parser.add_argument('-user', help="mysql database user", type=str, required=False, default='swyoo')
parser.add_argument('-pw', help="password", type=str, required=False, default='****')
parser.add_argument('-host', help="ip address", type=str, required=False, default='***.***.***.***')
parser.add_argument('-db', help="database name", type=str, required=False, default='classicmodels')
parser.add_argument('-charset', help="character set to use", type=str, required=False, default='utf8mb4')
# Parse an explicit empty argument list instead of overwriting the global
# sys.argv (the original did `sys.argv = ['-f']` to dodge Jupyter's kernel
# `-f` flag, which clobbers argv for any other code that reads it).
args = parser.parse_args([])
args
1
Namespace(charset='utf8mb4', db='classicmodels', host='***.***.***.***', pw='****', user='swyoo')
In [3]:
1
2
3
4
5
# Helper functions for talking to MySQL.

def sql(command):
    """Run a SQL query and return the result set as a pandas DataFrame.

    Uses the module-level connection ``con`` (defined in a later cell;
    the name is resolved at call time, so call only after connecting).
    """
    return pd.read_sql(command, con)


def fetch(command):
    """Execute ``command`` via the module-level ``cursor`` and return
    all fetched rows (a tuple of row tuples)."""
    cursor.execute(command)
    return cursor.fetchall()
In [4]:
1
2
# Open a raw DB-API connection to the MySQL server using the parsed args.
# No database is selected yet; a later cell issues "use classicmodels".
con = pymysql.connect(host=args.host, user=args.user, password=args.pw, use_unicode=True, charset=args.charset)
cursor = con.cursor()  # shared cursor used by the fetch() helper

Step 1. Use sqlalchemy, create engine

I use the sample database classicmodels on MySQL in this tutorial.

Create an engine to convert a pandas.DataFrame into a table in MySQL.
To do this, connect to the classicmodels database on MySQL.

In [5]:
1
2
3
4
5
6
# Make sure a scratch database exists (MySQL emits warning 1007 if it
# already does — see the warning in the cell output).
fetch("create database if not exists testdb")

# SQLAlchemy connection URL of the form
#   mysql+pymysql://<user>:<password>@<host>:3306/<db>?charset=<charset>
# The resulting connection is used to insert DataFrames as MySQL tables.
url_template = "mysql+pymysql://{}:{}@{}:3306/{}?charset={}"
db_data = url_template.format(args.user, args.pw, args.host, args.db, args.charset)
engine = create_engine(db_data).connect()
1
2
3
/home/swyoo/anaconda3/envs/torch/lib/python3.7/site-packages/pymysql/cursors.py:170: Warning: (1007, "Can't create database 'testdb'; database exists")
  result = self._query(query)

In [6]:
1
2
# Switch the session to the sample database and list its tables
# (the DataFrame returned by sql() renders as the table below).
fetch("use classicmodels")
sql("show tables")
Tables_in_classicmodels
0 customers
1 employees
2 offices
3 orderdetails
4 orders
5 payments
6 productlines
7 products

Step 2. Prepare the dataframe

In [11]:
1
2
# Download the Amazon "Musical Instruments" 5-core reviews dataset
# (-N: skip if the local copy is up to date) and decompress it in place.
!wget -N http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz
!gzip -d reviews_Musical_Instruments_5.json.gz    
1
2
3
4
5
6
7
8
9
10
11
12
--2020-05-08 18:03:39--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2460495 (2.3M) [application/x-gzip]
Saving to: ‘reviews_Musical_Instruments_5.json.gz’

reviews_Musical_Ins 100%[===================>]   2.35M  4.85MB/s    in 0.5s    

2020-05-08 18:03:40 (4.85 MB/s) - ‘reviews_Musical_Instruments_5.json.gz’ saved [2460495/2460495]


In [17]:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
DATA_JSON = "reviews_Musical_Instruments_5.json"

# Parse the line-delimited JSON review file into parallel lists.
# Records whose reviewer or product id is the literal string 'unknown'
# are skipped.
users_id = []
items_id = []
ratings = []
reviews = []
with open(DATA_JSON, 'r') as f:
    for line in f:
        js = json.loads(line)
        if str(js['reviewerID']) == 'unknown':
            print("unknown")
            continue
        if str(js['asin']) == "unknown":
            print("unknown")
            continue
        reviews.append(js['reviewText'])
        # FIX: the original appended a stray trailing "," to each id string;
        # dropped here — the category codes computed below are unaffected
        # because the suffix was applied uniformly to every id.
        users_id.append(str(js['reviewerID']))
        items_id.append(str(js['asin']))
        ratings.append(str(js['overall']))

df = pd.DataFrame(
    {'user_id': pd.Series(users_id),
     'item_id': pd.Series(items_id),
     'ratings': pd.Series(ratings),
     'reviews': pd.Series(reviews)}
)[['user_id', 'item_id', 'ratings', 'reviews']]

# Encode the string ids as dense integer codes via pandas' category dtype
# (https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html),
# then cast ratings to float for numeric use downstream.
user_id_int = df.user_id.astype('category').cat.codes
item_id_int = df.item_id.astype('category').cat.codes
df.user_id = user_id_int.astype('int64')
df.item_id = item_id_int.astype('int64')
df.ratings = df.ratings.astype('float64')
In [18]:
1
# Inspect the prepared frame (10,261 rows x 4 columns per the output below).
df
user_id item_id ratings reviews
0 550 0 5.0 Not much to write about here, but it does exac...
1 55 0 5.0 The product does exactly as it should and is q...
2 92 0 5.0 The primary job of this device is to block the...
3 482 0 5.0 Nice windscreen protects my MXL mic and preven...
4 1145 0 5.0 This pop filter is great. It looks and perform...
... ... ... ... ...
10256 45 899 5.0 Great, just as expected. Thank to all.
10257 279 899 5.0 I've been thinking about trying the Nanoweb st...
10258 1390 899 4.0 I have tried coated strings in the past ( incl...
10259 720 899 4.0 Well, MADE by Elixir and DEVELOPED with Taylor...
10260 683 899 4.0 These strings are really quite good, but I wou...

10261 rows × 4 columns

Step 3. pandas to MySQL table using the engine

In [20]:
1
2
# Write the DataFrame into MySQL as table `music` through the SQLAlchemy
# connection. if_exists='replace' drops and recreates any existing table;
# the DataFrame index is written too, becoming an `index` column.
df.to_sql(name='music', con=engine, if_exists='replace')
print("insert all data set to database done")
1
2
insert all data set to database done

In [21]:
1
# Read the table back to verify the insert round-trips
# (note the extra `index` column written by to_sql).
sql("select * from music")
index user_id item_id ratings reviews
0 0 550 0 5.0 Not much to write about here, but it does exac...
1 1 55 0 5.0 The product does exactly as it should and is q...
2 2 92 0 5.0 The primary job of this device is to block the...
3 3 482 0 5.0 Nice windscreen protects my MXL mic and preven...
4 4 1145 0 5.0 This pop filter is great. It looks and perform...
... ... ... ... ... ...
10256 10256 45 899 5.0 Great, just as expected. Thank to all.
10257 10257 279 899 5.0 I've been thinking about trying the Nanoweb st...
10258 10258 1390 899 4.0 I have tried coated strings in the past ( incl...
10259 10259 720 899 4.0 Well, MADE by Elixir and DEVELOPED with Taylor...
10260 10260 683 899 4.0 These strings are really quite good, but I wou...

10261 rows × 5 columns

Leave a comment