In [14]:
1
2
3
import pymysql, argparse, os, sys, json
from sqlalchemy import create_engine
import pandas as pd
How to use pymysql with pandas and MySQL
Step 0. Connect to MySQL
In [2]:
1
2
3
4
5
6
7
8
9
# Notebook configuration via argparse defaults.
# NOTE(review): real credentials/hosts should come from environment variables or a
# secrets manager, never be typed into a notebook — the defaults here stay masked.
parser = argparse.ArgumentParser()
parser.add_argument('-user', help="mysql database user", type=str, required=False, default='swyoo')
parser.add_argument('-pw', help="password", type=str, required=False, default='****')
parser.add_argument('-host', help="ip address", type=str, required=False, default='***.***.***.***')
parser.add_argument('-db', help="database name", type=str, required=False, default='classicmodels')
parser.add_argument('-charset', help="character set to use", type=str, required=False, default='utf8mb4')
# Parse an explicit empty argument list instead of mutating the global sys.argv
# (the old `sys.argv = ['-f']` trick clobbers interpreter state for every later cell).
args = parser.parse_args([])
args
1
Namespace(charset='utf8mb4', db='classicmodels', host='***.***.***.***', pw='****', user='swyoo')
In [3]:
1
2
3
4
5
# Helper functions for querying MySQL.
def sql(command):
    """Run `command` and return the result set as a pandas DataFrame.

    Uses the module-level pymysql connection `con` created below.
    """
    return pd.read_sql(command, con)

def fetch(command):
    """Execute `command` on the module-level `cursor` and return all rows.

    Returns an empty tuple for statements that produce no result set.
    """
    cursor.execute(command)
    return cursor.fetchall()
In [4]:
1
2
# Open the MySQL connection (no default schema selected yet) and grab a raw
# cursor for the `fetch` helper to execute statements with.
con = pymysql.connect(
    user=args.user,
    password=args.pw,
    host=args.host,
    charset=args.charset,
    use_unicode=True,
)
cursor = con.cursor()
Step 1. Use sqlalchemy, create engine
I use sample database classicmodels
on MySQL in this tutorial
Create engine to convert pandas.DataFrame to a table in MySQL.
Therefore, connect to a database classicmodels
on MySQL
In [5]:
1
2
3
4
5
6
# Make sure the target schema exists, then build a SQLAlchemy connection that
# pandas can use as the destination for DataFrame.to_sql.
fetch("create database if not exists testdb")

# Connection URL shape: mysql+pymysql://<user>:<password>@<host>:3306/<db>?charset=<charset>
db_data = f"mysql+pymysql://{args.user}:{args.pw}@{args.host}:3306/{args.db}?charset={args.charset}"
engine = create_engine(db_data).connect()
1
2
3
/home/swyoo/anaconda3/envs/torch/lib/python3.7/site-packages/pymysql/cursors.py:170: Warning: (1007, "Can't create database 'testdb'; database exists")
result = self._query(query)
In [6]:
1
2
# Point the raw cursor at the sample schema, then list its tables as a
# DataFrame via the `sql` helper (rendered below the cell).
fetch("use classicmodels")
sql("show tables")
Tables_in_classicmodels | |
---|---|
0 | customers |
1 | employees |
2 | offices |
3 | orderdetails |
4 | orders |
5 | payments |
6 | productlines |
7 | products |
Step 2. Prepare dataframe
In [11]:
1
2
# Download the Amazon "Musical Instruments" 5-core review dump from SNAP
# (-N: only re-download when the remote copy is newer) and decompress in place.
!wget -N http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz
!gzip -d reviews_Musical_Instruments_5.json.gz
1
2
3
4
5
6
7
8
9
10
11
12
--2020-05-08 18:03:39-- http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Musical_Instruments_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2460495 (2.3M) [application/x-gzip]
Saving to: ‘reviews_Musical_Instruments_5.json.gz’
reviews_Musical_Ins 100%[===================>] 2.35M 4.85MB/s in 0.5s
2020-05-08 18:03:40 (4.85 MB/s) - ‘reviews_Musical_Instruments_5.json.gz’ saved [2460495/2460495]
In [17]:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
DATA_JSON = "reviews_Musical_Instruments_5.json"

# Read the line-delimited JSON review dump into parallel lists, skipping any
# record whose reviewer or product id is the literal placeholder "unknown".
users_id = []
items_id = []
ratings = []
reviews = []
with open(DATA_JSON, 'r') as f:
    for line in f:
        js = json.loads(line)
        # Skip placeholder ids rather than letting them pollute the category
        # encoding below (merged from two identical if/continue branches).
        if str(js['reviewerID']) == 'unknown' or str(js['asin']) == 'unknown':
            print("unknown")
            continue
        reviews.append(js['reviewText'])
        # NOTE: the original appended a trailing "," to every id; since the same
        # suffix was added to all values, the relative ordering — and therefore
        # the category codes computed below — is unchanged by dropping it.
        users_id.append(str(js['reviewerID']))
        items_id.append(str(js['asin']))
        ratings.append(str(js['overall']))

df = pd.DataFrame(
    {'user_id': users_id,
     'item_id': items_id,
     'ratings': ratings,
     'reviews': reviews}
)[['user_id', 'item_id', 'ratings', 'reviews']]

# Encode the string ids as dense integer codes (0..n_categories-1); see
# https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html
df.user_id = df.user_id.astype('category').cat.codes.astype('int64')
df.item_id = df.item_id.astype('category').cat.codes.astype('int64')
df.ratings = df.ratings.astype('float64')
In [18]:
1
df
user_id | item_id | ratings | reviews | |
---|---|---|---|---|
0 | 550 | 0 | 5.0 | Not much to write about here, but it does exac... |
1 | 55 | 0 | 5.0 | The product does exactly as it should and is q... |
2 | 92 | 0 | 5.0 | The primary job of this device is to block the... |
3 | 482 | 0 | 5.0 | Nice windscreen protects my MXL mic and preven... |
4 | 1145 | 0 | 5.0 | This pop filter is great. It looks and perform... |
... | ... | ... | ... | ... |
10256 | 45 | 899 | 5.0 | Great, just as expected. Thank to all. |
10257 | 279 | 899 | 5.0 | I've been thinking about trying the Nanoweb st... |
10258 | 1390 | 899 | 4.0 | I have tried coated strings in the past ( incl... |
10259 | 720 | 899 | 4.0 | Well, MADE by Elixir and DEVELOPED with Taylor... |
10260 | 683 | 899 | 4.0 | These strings are really quite good, but I wou... |
10261 rows × 4 columns
Step 3. pandas to mysql table using engine
In [20]:
1
2
# Write the DataFrame to MySQL table `music`, replacing any existing table.
# Note: the DataFrame index is written as an extra `index` column (to_sql's
# default, visible in the query output below); pass index=False to omit it.
df.to_sql(name='music', con=engine, if_exists='replace')
print("insert all data set to database done")
1
2
insert all data set to database done
In [21]:
1
# Read the table back to confirm the insert round-trips correctly.
sql("select * from music")
index | user_id | item_id | ratings | reviews | |
---|---|---|---|---|---|
0 | 0 | 550 | 0 | 5.0 | Not much to write about here, but it does exac... |
1 | 1 | 55 | 0 | 5.0 | The product does exactly as it should and is q... |
2 | 2 | 92 | 0 | 5.0 | The primary job of this device is to block the... |
3 | 3 | 482 | 0 | 5.0 | Nice windscreen protects my MXL mic and preven... |
4 | 4 | 1145 | 0 | 5.0 | This pop filter is great. It looks and perform... |
... | ... | ... | ... | ... | ... |
10256 | 10256 | 45 | 899 | 5.0 | Great, just as expected. Thank to all. |
10257 | 10257 | 279 | 899 | 5.0 | I've been thinking about trying the Nanoweb st... |
10258 | 10258 | 1390 | 899 | 4.0 | I have tried coated strings in the past ( incl... |
10259 | 10259 | 720 | 899 | 4.0 | Well, MADE by Elixir and DEVELOPED with Taylor... |
10260 | 10260 | 683 | 899 | 4.0 | These strings are really quite good, but I wou... |
10261 rows × 5 columns
Leave a comment