import os
import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta
import math
from sqlalchemy import create_engine
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from imblearn.under_sampling import RandomUnderSampler


if 'PG_PORT' in os.environ:

    host = 'db'
    port = os.environ['PG_PORT']
    database = os.environ['PG_DATABASE']
    user = os.environ['PG_USER']
    password = os.environ['PG_PASSWORD']
    
    # pd.read_sql用のコネクタ
    conn = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{database}")

    df_customer = pd.read_sql(sql='select * from customer', con=conn)
    df_category = pd.read_sql(sql='select * from category', con=conn)
    df_product = pd.read_sql(sql='select * from product', con=conn)
    df_receipt = pd.read_sql(sql='select * from receipt', con=conn)
    df_store = pd.read_sql(sql='select * from store', con=conn)
    df_geocode = pd.read_sql(sql='select * from geocode', con=conn)

else:
    if not os.path.exists('../data/'):
        !git clone https://github.com/The-Japan-DataScientist-Society/100knocks-preprocess
        os.chdir('100knocks-preprocess/docker/work/answer')

    dtype = {
        'customer_id': str,
        'gender_cd': str,
        'postal_cd': str,
        'application_store_cd': str,
        'status_cd': str,
        'category_major_cd': str,
        'category_medium_cd': str,
        'category_small_cd': str,
        'product_cd': str,
        'store_cd': str,
        'prefecture_cd': str,
        'tel_no': str,
        'postal_cd': str,
        'street': str
    }
    
    df_customer = pd.read_csv("../data/customer.csv", dtype=dtype)
    df_category = pd.read_csv("../data/category.csv", dtype=dtype)
    df_product = pd.read_csv("../data/product.csv", dtype=dtype)
    df_receipt = pd.read_csv("../data/receipt.csv", dtype=dtype)
    df_store = pd.read_csv("../data/store.csv", dtype=dtype)
    df_geocode = pd.read_csv("../data/geocode.csv", dtype=dtype)


df_receipt.head(10)


df_receipt[['sales_ymd', 'customer_id', 'product_cd', 'amount']].head(10)


df_receipt[['sales_ymd', 'customer_id', 'product_cd', 'amount']]. \
                        rename(columns={'sales_ymd': 'sales_date'}).head(10)


# コード例1（queryを使う場合）
df_receipt[['sales_ymd', 'customer_id', 'product_cd', 'amount']]. \
                                query('customer_id == "CS018205000001"')


# コード例1（queryを使わない場合）
df = df_receipt[['sales_ymd', 'customer_id', 'product_cd', 'amount']]
df[df['customer_id'] == 'CS018205000001']


df_receipt[['sales_ymd', 'customer_id', 'product_cd', 'amount']] \
            .query('customer_id == "CS018205000001" & amount >= 1000')


df_receipt[['sales_ymd', 'customer_id', 'product_cd', 'quantity', 'amount']].\
    query('customer_id == "CS018205000001" & (amount >= 1000 | quantity >=5)')


df_receipt[['sales_ymd', 'customer_id', 'product_cd', 'amount']] \
    .query('customer_id == "CS018205000001" & 1000 <= amount <= 2000')


df_receipt[['sales_ymd', 'customer_id', 'product_cd', 'amount']] \
    .query('customer_id == "CS018205000001" & product_cd != "P071401019"')


df_store.query('prefecture_cd != "13" & floor_area <= 900')


df_store.query("store_cd.str.startswith('S14')", engine='python').head(10)


df_customer.query("customer_id.str.endswith('1')", engine='python').head(10)


df_store.query("address.str.contains('横浜市')", engine='python')


df_customer.query("status_cd.str.contains(r'^[A-F]')", 
                  engine='python').head(10)


# regexのオプションをつけることもできる（Falseにすれば正規表現ではなくそのままの文字列として扱われる）
df_customer.query("status_cd.str.contains(r'[1-9]$', regex=True)", 
                  engine='python').head(10)


df_customer.query("status_cd.str.contains(r'^[A-F].*[1-9]$')", 
                  engine='python').head(10)


df_store.query("tel_no.str.contains(r'^[0-9]{3}-[0-9]{3}-[0-9]{4}$')", 
               engine='python')


df_customer.sort_values('birth_day').head(10)


df_customer.sort_values('birth_day', ascending=False).head(10)


df_tmp = pd.concat([df_receipt[['customer_id', 'amount']] 
                     ,df_receipt['amount'].rank(method='min', 
                                                ascending=False)], axis=1)

df_tmp.columns = ['customer_id', 'amount', 'ranking']

df_tmp.sort_values('ranking').head(10)


df_tmp = pd.concat([df_receipt[['customer_id', 'amount']] 
                     ,df_receipt['amount'].rank(method='first', 
                                                ascending=False)], axis=1)

df_tmp.columns = ['customer_id', 'amount', 'ranking']

df_tmp.sort_values('ranking').head(10)


len(df_receipt)

104681


len(df_receipt['customer_id'].unique())

8307


# コード例1
df_receipt.groupby('store_cd').agg({'amount':'sum', 
                                    'quantity':'sum'}).reset_index()


# コード例2
df_receipt.groupby('store_cd')[['amount','quantity']].agg('sum').reset_index()


df_receipt.groupby('customer_id').agg({'sales_ymd': 'max'}).reset_index().head(10)


# 024と同じ書き方もできるがあえて違う書き方で解答例を紹介
df_receipt.groupby('customer_id').sales_ymd.min().reset_index().head(10)


df_tmp = df_receipt.groupby('customer_id'). \
                agg({'sales_ymd':['max','min']}).reset_index()

# マルチインデックス（項目）の階層を"_"でつなぎながら1階層のインデックス（項目）にする
# df_tmp.columns = ['customer_id', 'sales_ymd_max', 'sales_ymd_min'] としても良い
df_tmp.columns = ["_".join(pair) for pair in df_tmp.columns]

df_tmp.query('sales_ymd_max != sales_ymd_min').head(10)


df_receipt.groupby('store_cd').agg({'amount':'mean'}).reset_index(). \
                            sort_values('amount', ascending=False).head(5)


df_receipt.groupby('store_cd').agg({'amount':'median'}).reset_index(). \
                            sort_values('amount', ascending=False).head(5)


df_receipt.groupby('store_cd').product_cd. \
            apply(lambda x: x.mode()).reset_index().head(10)


df_receipt.groupby('store_cd').amount.var(ddof=0).reset_index(). \
                            sort_values('amount', ascending=False).head(5)


df_receipt.groupby('store_cd').amount.std(ddof=0).reset_index(). \
                            sort_values('amount', ascending=False).head(5)


# コード例1
np.percentile(df_receipt['amount'], q=np.arange(1, 5) * 25)

array([  102.,   170.,   288., 10925.])


# コード例2
df_receipt.amount.quantile(q=np.arange(1, 5) / 4)

0.25      102.0
0.50      170.0
0.75      288.0
1.00    10925.0
Name: amount, dtype: float64


df_receipt.groupby('store_cd').amount.mean(). \
                    reset_index().query('amount >= 330')


# コード例1: queryを使わない書き方
df_receipt[~df_receipt['customer_id'].str.startswith("Z")]. \
                            groupby('customer_id').amount.sum().mean()

2547.742234529256


# コード例2: queryを使う書き方
df_receipt.query('not customer_id.str.startswith("Z")', 
                 engine='python').groupby('customer_id').amount.sum().mean()

2547.742234529256


df_amount_sum = df_receipt[~df_receipt['customer_id'].str.startswith("Z")].\
                                    groupby('customer_id').amount.sum()

amount_mean = df_amount_sum.mean()

df_amount_sum = df_amount_sum.reset_index()

df_amount_sum[df_amount_sum['amount'] >= amount_mean].head(10)


pd.merge(df_receipt, df_store[['store_cd','store_name']], 
         how='inner', on='store_cd').head(10)


pd.merge(df_product
         , df_category[['category_small_cd','category_small_name']]
         , how='inner', on='category_small_cd').head(10)


df_amount_sum = df_receipt.groupby('customer_id').amount.sum().reset_index()

df_tmp = df_customer. \
            query('gender_cd == "1" and not customer_id.str.startswith("Z")', 
                  engine='python')

pd.merge(df_tmp['customer_id'], df_amount_sum, 
         how='left', on='customer_id').fillna(0).head(10)


df_data = df_receipt \
          .query('not customer_id.str.startswith("Z")', engine='python')

df_cnt = df_data[~df_data.duplicated(subset=['customer_id', 'sales_ymd'])] \
         .groupby('customer_id').sales_ymd.count().reset_index() \
         .sort_values('sales_ymd', ascending=False).head(20)

df_sum = df_data.groupby('customer_id').amount.sum().reset_index() \
         .sort_values('amount', ascending=False).head(20)

pd.merge(df_cnt, df_sum, how='outer', on='customer_id')


df_store_tmp = df_store.copy()
df_product_tmp = df_product.copy()

df_store_tmp['key'] = 0
df_product_tmp['key'] = 0

len(pd.merge(df_store_tmp, df_product_tmp, how='outer', on='key'))

531590


df_sales_amount_by_date = df_receipt[['sales_ymd', 'amount']].\
                                groupby('sales_ymd').sum().reset_index()

df_sales_amount_by_date = pd.concat([df_sales_amount_by_date, 
                                     df_sales_amount_by_date.shift()], axis=1)

df_sales_amount_by_date.columns = ['sales_ymd','amount','lag_ymd','lag_amount']

df_sales_amount_by_date['diff_amount'] = \
    df_sales_amount_by_date['amount'] - df_sales_amount_by_date['lag_amount']

df_sales_amount_by_date.head(10)


# コード例1:縦持ちケース
df_sales_amount_by_date = df_receipt[['sales_ymd', 'amount']]. \
                                groupby('sales_ymd').sum().reset_index()

for i in range(1, 4):
    df_tmp = pd.concat([df_sales_amount_by_date, 
                        df_sales_amount_by_date.shift(i)], axis=1)
    if i == 1:
        df_lag = df_tmp
    else:
        # DataFrameでappendが削除されるためappend -> concatに変更
        df_lag = pd.concat([df_lag, df_tmp], axis=0)

df_lag.columns = ['sales_ymd', 'amount', 'lag_ymd', 'lag_amount']

df_lag.dropna().astype(int).sort_values(['sales_ymd','lag_ymd']).head(10)


# コード例2:横持ちケース
df_sales_amount_by_date = df_receipt[['sales_ymd', 'amount']].\
                                groupby('sales_ymd').sum().reset_index()

df_lag = df_sales_amount_by_date

for i in range(1, 4):
    df_lag = pd.concat([df_lag, df_sales_amount_by_date.shift(i)], axis=1)
    columns = [f'lag_ymd_{i}', f'lag_amount_{i}']
    df_lag.columns = list(df_lag.columns)[:-len(columns)] + columns

df_lag.dropna().astype(int).sort_values(['sales_ymd']).head(10)


# コード例1
df_tmp = pd.merge(df_receipt, df_customer, how ='inner', on="customer_id")

df_tmp['era'] = df_tmp['age'].apply(lambda x: math.floor(x / 10) * 10)

df_sales_summary = pd.pivot_table(
                        df_tmp, index='era',
                        columns='gender_cd', 
                        values='amount',
                        aggfunc='sum'
                    ).reset_index()

df_sales_summary.columns = ['era', 'male', 'female', 'unknown']

df_sales_summary


# コード例2
df_tmp = pd.merge(df_receipt, df_customer, how ='inner', on="customer_id")

df_tmp['era'] = np.floor(df_tmp['age'] / 10).astype(int)  * 10

df_sales_summary = pd.pivot_table(df_tmp, index='era', columns='gender_cd', 
                                  values='amount', aggfunc='sum').reset_index()

df_sales_summary.columns = ['era', 'male', 'female', 'unknown']

df_sales_summary


df_sales_summary.set_index('era'). \
    stack().reset_index().replace({'female':'01','male':'00','unknown':'99'}). \
    rename(columns={'level_1':'gender_cd', 0: 'amount'})


# 以下の書き方でYYYYMMDD形式の文字列に変換できる
# pd.to_datetime(df_customer['birth_day']).dt.strftime('%Y%m%d')

pd.concat([df_customer['customer_id'],
           pd.to_datetime(df_customer['birth_day']).dt.strftime('%Y%m%d')],
          axis = 1).head(10)


pd.concat([df_customer['customer_id'],
           pd.to_datetime(df_customer['application_date'])], axis=1).head(10)


pd.concat([df_receipt[['receipt_no', 'receipt_sub_no']],
           pd.to_datetime(df_receipt['sales_ymd'].astype('str'))],
          axis=1).head(10)


pd.concat([df_receipt[['receipt_no', 'receipt_sub_no']],
    pd.to_datetime(df_receipt['sales_epoch'], unit='s').rename('sales_ymd')], 
    axis=1).head(10)


pd.concat([df_receipt[['receipt_no', 'receipt_sub_no']],
           pd.to_datetime(df_receipt['sales_epoch'], 
                          unit='s').dt.year.rename('sales_year')],
          axis=1).head(10)


# dt.monthでも月を取得できるが、ここでは0埋め２桁で取り出すためstrftimeを利用している
df_datetime = pd.to_datetime(df_receipt['sales_epoch'], 
                             unit='s').rename('sales_month')

pd.concat([df_receipt[['receipt_no', 'receipt_sub_no']],
           df_datetime.dt.strftime('%m')],axis=1).head(10)


# dt.dayでも日を取得できるが、ここでは0埋め２桁で取り出すためstrftimeを利用している
df_datetime = pd.to_datetime(df_receipt['sales_epoch'], 
                             unit='s').rename('sales_day')

pd.concat([df_receipt[['receipt_no', 'receipt_sub_no']],
           df_datetime.dt.strftime('%d')], axis=1).head(10)


# コード例1
df_sales_amount = df_receipt.query('not customer_id.str.startswith("Z")', 
                                   engine='python')

df_sales_amount = df_sales_amount[['customer_id', 'amount']]. \
                                    groupby('customer_id').sum().reset_index()

df_sales_amount['sales_flg'] = df_sales_amount['amount']. \
                                    apply(lambda x: 1 if x > 2000 else 0)

df_sales_amount.head(10)


# コード例2（np.whereの活用）
df_sales_amount = df_receipt.query('not customer_id.str.startswith("Z")', 
                                   engine='python')

df_sales_amount = df_sales_amount[['customer_id', 'amount']]. \
                                    groupby('customer_id').sum().reset_index()

df_sales_amount['sales_flg'] = np.where(df_sales_amount['amount'] > 2000, 1, 0)

df_sales_amount.head(10)


# コード例1
df_tmp = df_customer[['customer_id', 'postal_cd']].copy()

df_tmp['postal_flg'] = df_tmp['postal_cd']. \
                        apply(lambda x: 1 if 100 <= int(x[0:3]) <= 209 else 0)

pd.merge(df_tmp, df_receipt, how='inner', on='customer_id'). \
    groupby('postal_flg').agg({'customer_id':'nunique'})


# コード例2（np.where、betweenの活用）
df_tmp = df_customer[['customer_id', 'postal_cd']].copy()

df_tmp['postal_flg'] = np.where(df_tmp['postal_cd'].str[0:3].astype(int)
                                .between(100, 209), 1, 0)

pd.merge(df_tmp, df_receipt, how='inner', on='customer_id'). \
    groupby('postal_flg').agg({'customer_id':'nunique'})


# コード例1（固定で切り出す）
df_customer_tmp = df_customer[['customer_id', 'address']].copy()

df_customer_tmp['prefecture_cd'] = \
            df_customer['address'].str[0:3].map({'埼玉県': '11',
                                                '千葉県':'12', 
                                                '東京都':'13', 
                                                '神奈川':'14'})

df_customer_tmp.head(10)


# コード例2（正規表現を使う）
df_customer_tmp = df_customer[['customer_id', 'address']].copy()

df_customer_tmp['prefecture_cd'] = \
            df_customer['address'].str.extract(r'(^.*?[都道府県])')[0].\
                                            map({'埼玉県': '11',
                                                 '千葉県':'12', 
                                                 '東京都':'13', 
                                                 '神奈川県':'14'})

df_customer_tmp.head(10)


# コード例1
df_sales_amount = df_receipt[['customer_id', 'amount']]. \
                        groupby('customer_id').sum().reset_index()

pct25 = np.quantile(df_sales_amount['amount'], 0.25)
pct50 = np.quantile(df_sales_amount['amount'], 0.5)
pct75 = np.quantile(df_sales_amount['amount'], 0.75)

def pct_group(x):
    if x < pct25:
        return 1
    elif pct25 <= x < pct50:
        return 2
    elif pct50 <= x < pct75:
        return 3
    elif pct75 <= x:
        return 4

df_sales_amount['pct_group'] = df_sales_amount['amount'].apply(pct_group)

df_sales_amount.head(10)


# 確認用コード
print('pct25:', pct25)
print('pct50:', pct50)
print('pct75:', pct75)

pct25: 548.5
pct50: 1478.0
pct75: 3651.0


# コード例2（cutを使った例、四分位範囲も参考までに追加表示）
df_temp = df_receipt[['customer_id', 'amount']]. \
                        groupby('customer_id').sum().reset_index()

pct25 = np.quantile(df_sales_amount['amount'], 0.25)
pct50 = np.quantile(df_sales_amount['amount'], 0.5)
pct75 = np.quantile(df_sales_amount['amount'], 0.75)
pct_max = df_sales_amount['amount'].max()

df_temp['quantile'] = pd.cut(df_sales_amount['amount'],[0.0, pct25, pct50, pct75,pct_max+0.1], right=False)

df_temp['pct_group'] = df_temp.groupby('quantile').ngroup() + 1

df_temp.head(10)


# 参考コード（qcutを使った例、境界値の含む／含まないが逆になっており題意を満たさないが参考までに記載）
df_temp = df_receipt.groupby('customer_id')[['amount']].sum()

df_temp['quantile'], bins = \
    pd.qcut(df_receipt.groupby('customer_id')['amount'].sum(), 4, retbins=True) 

df_temp['pct_group'] = df_temp.groupby('quantile').ngroup() + 1

df_temp.reset_index(inplace=True)

display(df_temp.head(10))

print('quantiles:', bins)

quantiles: [7.0000000e+01 5.4850000e+02 1.4780000e+03 3.6510000e+03 1.2395003e+07]


# コード例1
df_customer_era = df_customer[['customer_id', 'birth_day']].copy()

df_customer_era['era'] = df_customer['age']. \
                                apply(lambda x: min(math.floor(x / 10) * 10, 60))

df_customer_era.head(10)


# コード例2（cutの例、カテゴリは範囲で出力）
df_customer_era = df_customer[['customer_id', 'birth_day']].copy()

df_customer_era['era'] = pd.cut(df_customer['age'], 
                                  bins=[0, 10, 20, 30, 40, 50, 60, np.inf], 
                                  right=False)

df_customer_era[['customer_id', 'birth_day', 'era']].head(10)


# 性別コード１桁と年代コード２桁を連結した性年代コードを生成する

df_customer_era = df_customer[['customer_id', 'birth_day']].copy()

df_customer_era['era'] = df_customer['age']. \
                                apply(lambda x: min(math.floor(x / 10) * 10, 60))

df_customer_era['gender_era'] = \
    df_customer['gender_cd'] + df_customer_era['era'].astype('str').str.zfill(2)

df_customer_era.head(10)


# コード例1（すべてのコード値を項目化）
pd.get_dummies(df_customer[['customer_id', 'gender_cd']], 
               columns=['gender_cd']).head(10)


# コード例2（項目を一つ削ったり区切り文字を変えたりできる）
pd.get_dummies(df_customer[['customer_id', 'gender_cd']], 
               columns=['gender_cd'], 
               drop_first=True, prefix='gen', prefix_sep='#').head(10)


# skleanのpreprocessing.scaleを利用するため、データの標準偏差で計算されている
df_sales_amount = df_receipt.query('not customer_id.str.startswith("Z")', 
                                   engine='python'). \
                                   groupby('customer_id'). \
                                   agg({'amount':'sum'}).reset_index()

df_sales_amount['std_amount'] = preprocessing.scale(df_sales_amount['amount'])

df_sales_amount.head(10)


# コード例2（fitを行うことで、別のデータでも同じ平均・標準偏差で標準化を行える）
df_sales_amount = df_receipt.query('not customer_id.str.startswith("Z")', 
                                   engine='python'). \
                                   groupby('customer_id'). \
                                   agg({'amount':'sum'}).reset_index()

scaler = preprocessing.StandardScaler()

scaler.fit(df_sales_amount[['amount']])

df_sales_amount['std_amount'] = scaler.transform(df_sales_amount[['amount']])

df_sales_amount.head(10)


# コード例1
df_sales_amount = df_receipt.query('not customer_id.str.startswith("Z")', 
                                   engine='python'). \
                                   groupby('customer_id'). \
                                   agg({'amount':'sum'}).reset_index()

df_sales_amount['scale_amount'] = \
                        preprocessing.minmax_scale(df_sales_amount['amount'])

df_sales_amount.head(10)


# コード例2（fitを行うことで、別のデータでも同じ最小値・最大値で標準化を行える）
df_sales_amount = df_receipt.query('not customer_id.str.startswith("Z")', 
                                   engine='python'). \
                                   groupby('customer_id'). \
                                   agg({'amount':'sum'}).reset_index()

scaler = preprocessing.MinMaxScaler()

scaler.fit(df_sales_amount[['amount']])

df_sales_amount['scale_amount'] = scaler.transform(df_sales_amount[['amount']])

df_sales_amount.head(10)


df_sales_amount = df_receipt.query('not customer_id.str.startswith("Z")', 
                                   engine='python'). \
                                   groupby('customer_id'). \
                                   agg({'amount':'sum'}).reset_index()

df_sales_amount['log_amount'] = np.log10(df_sales_amount['amount'] + 0.5)

df_sales_amount.head(10)


df_sales_amount = df_receipt.query('not customer_id.str.startswith("Z")', 
                                   engine='python'). \
                                   groupby('customer_id'). \
                                   agg({'amount':'sum'}).reset_index()

df_sales_amount['log_amount'] = np.log(df_sales_amount['amount'] + 0.5)

df_sales_amount.head(10)


df_tmp = df_product.copy()

df_tmp['unit_profit'] = df_tmp['unit_price'] - df_tmp['unit_cost']

df_tmp.head(10)


df_tmp = df_product.copy()

df_tmp['unit_profit_rate'] = \
            (df_tmp['unit_price'] - df_tmp['unit_cost']) / df_tmp['unit_price']

df_tmp['unit_profit_rate'].mean(skipna=True)

0.24911389885177


df_tmp = df_product[['product_cd', 'unit_price', 'unit_cost']].copy()

df_tmp['new_price'] = np.floor(df_tmp['unit_cost'] / 0.7)

df_tmp['new_profit_rate'] = \
            (df_tmp['new_price'] - df_tmp['unit_cost']) / df_tmp['new_price']

df_tmp.head(10)


df_tmp = df_product[['product_cd', 'unit_price', 'unit_cost']].copy()

df_tmp['new_price'] = np.round(df_tmp['unit_cost'] / 0.7)

df_tmp['new_profit_rate'] = \
            (df_tmp['new_price'] - df_tmp['unit_cost']) / df_tmp['new_price']

df_tmp.head(10)


df_tmp = df_product[['product_cd', 'unit_price', 'unit_cost']].copy()

df_tmp['new_price'] = np.ceil(df_tmp['unit_cost'] / 0.7)

df_tmp['new_profit_rate'] = \
            (df_tmp['new_price'] - df_tmp['unit_cost']) / df_tmp['new_price']

df_tmp.head(10)


df_tmp = df_tmp = df_product[['product_cd', 'unit_price']].copy()

df_tmp['tax_price'] = np.floor(df_tmp['unit_price'] * 1.1)

df_tmp.head(10)


# コード例1
df_tmp_1 = df_receipt.groupby('customer_id').agg({'amount':'sum'}). \
                reset_index().rename(columns={'amount':'sum_all'})

df_tmp_2 = pd.merge(df_receipt, df_product.query('category_major_cd == "07"'), 
                how='inner', on='product_cd').groupby('customer_id').\
                agg({'amount':'sum'}).reset_index().\
                rename(columns={'amount':'sum_07'})

df_tmp_3 = pd.merge(df_tmp_1, df_tmp_2, how='inner', on='customer_id')

df_tmp_3['sales_rate'] = df_tmp_3['sum_07'] / df_tmp_3['sum_all']

df_tmp_3.head(10)


# コード例2（参考、unstackと横方向のsumを使った例）
df_temp = df_receipt.merge(df_product, how='left', on='product_cd'). \
        groupby(['customer_id', 'category_major_cd'])['amount'].sum().unstack()

df_temp = df_temp[df_temp['07'] > 0]

df_temp['sum_all'] = df_temp.sum(axis=1)

df_temp['sales_rate'] = df_temp['07'] / df_temp['sum_all']

# 以降はデータフレームの整形と表示のための処理
df_temp.columns.name = ''

df_temp = df_temp.reset_index()

df_temp.head(10)


df_tmp = df_receipt[['customer_id', 'sales_ymd']].drop_duplicates()

df_tmp = pd.merge(df_tmp, df_customer[['customer_id', 'application_date']],
                  how='inner', on='customer_id')

df_tmp['sales_ymd'] = pd.to_datetime(df_tmp['sales_ymd'].astype('str'))

df_tmp['application_date'] = pd.to_datetime(df_tmp['application_date'])

df_tmp['elapsed_days'] = df_tmp['sales_ymd'] - df_tmp['application_date']

df_tmp['elapsed_days'] = df_tmp['elapsed_days'].dt.days

df_tmp.head(10)


df_tmp = df_receipt[['customer_id', 'sales_ymd']].drop_duplicates()

df_tmp = pd.merge(df_tmp, df_customer[['customer_id', 'application_date']],
                  how='inner', on='customer_id')

df_tmp['sales_ymd'] = pd.to_datetime(df_tmp['sales_ymd'].astype('str'))

df_tmp['application_date'] = pd.to_datetime(df_tmp['application_date'])

df_tmp['elapsed_months'] = df_tmp[['sales_ymd', 'application_date']]. \
    apply(lambda x: relativedelta(x[0], x[1]).years * 12 + \
                    relativedelta(x[0], x[1]).months, axis=1)

df_tmp.head(10)


df_tmp = df_receipt[['customer_id', 'sales_ymd']].drop_duplicates()

df_tmp = pd.merge(df_tmp, df_customer[['customer_id', 'application_date']],
                  how='inner', on='customer_id')

df_tmp['sales_ymd'] = pd.to_datetime(df_tmp['sales_ymd'].astype('str'))

df_tmp['application_date'] = pd.to_datetime(df_tmp['application_date'])

df_tmp['elapsed_years'] = df_tmp[['sales_ymd', 'application_date']]. \
                    apply(lambda x: relativedelta(x[0], x[1]).years, axis=1)

df_tmp.head(10)


df_tmp = df_receipt[['customer_id', 'sales_ymd']].drop_duplicates()

df_tmp = pd.merge(df_tmp, df_customer[['customer_id', 'application_date']],
                  how='inner', on='customer_id')

df_tmp['sales_ymd'] = pd.to_datetime(df_tmp['sales_ymd'].astype('str'))

df_tmp['application_date'] = pd.to_datetime(df_tmp['application_date'])

df_tmp['elapsed_epoch'] = df_tmp['sales_ymd'].view(np.int64) - \
                                    df_tmp['application_date'].view(np.int64)

df_tmp['elapsed_epoch'] = df_tmp['elapsed_epoch'] / 10**9

df_tmp.head(10)


df_tmp = df_receipt[['sales_ymd']].copy()

df_tmp['sales_ymd'] = pd.to_datetime(df_tmp['sales_ymd'].astype('str'))

df_tmp['elapsed_days']  = df_tmp['sales_ymd'].apply(lambda x:x.weekday())

df_tmp['monday'] = \
    df_tmp['sales_ymd'].apply(lambda x: x - relativedelta(days=x.weekday()))

df_tmp.head(10)


df_customer.sample(frac=0.01).head(10)


# sklearn.model_selection.train_test_splitを使用した例
_, df_tmp = train_test_split(df_customer, test_size=0.1, 
                             stratify=df_customer['gender_cd'])

df_tmp.groupby('gender_cd').agg({'customer_id' : 'count'})


df_sales_amount = df_receipt.groupby('customer_id'). \
                                agg({'amount':'sum'}).reset_index()

df_sales_amount['log_sum_amount'] = np.log(df_sales_amount['amount'] + 0.5)

df_sales_amount['log_sum_amount_ss'] = preprocessing.scale(df_sales_amount['log_sum_amount'])

df_sales_amount.query('abs(log_sum_amount_ss) > 3').head(10)


df_sales_amount = df_receipt.query('not customer_id.str.startswith("Z")', 
                                   engine='python'). \
                                   groupby('customer_id'). \
                                   agg({'amount':'sum'}).reset_index()

pct25 = np.percentile(df_sales_amount['amount'], q=25)
pct75 = np.percentile(df_sales_amount['amount'], q=75)

iqr = pct75 - pct25
amount_low = pct25 - (iqr * 1.5)
amount_hight = pct75 + (iqr * 1.5)

df_sales_amount.query('amount < @amount_low or @amount_hight < amount').head(10)


df_product.isnull().sum()

product_cd            0
category_major_cd     0
category_medium_cd    0
category_small_cd     0
unit_price            7
unit_cost             7
dtype: int64


df_product_1 = df_product.copy()

df_product_1.dropna(inplace=True)

print('削除前:', len(df_product))
print('削除後:', len(df_product_1))

削除前: 10030
削除後: 10023


# コード例1（Pandasのfillna）
df_product_2 = df_product.fillna({
    'unit_price':np.round(np.nanmean(df_product['unit_price'])), 
    'unit_cost':np.round(np.nanmean(df_product['unit_cost']))})

df_product_2.isnull().sum()

product_cd            0
category_major_cd     0
category_medium_cd    0
category_small_cd     0
unit_price            0
unit_cost             0
dtype: int64


# コード例2（scikit-learnのSimpleImputer）
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

imp_values = imp_mean.fit_transform(df_product[['unit_price', 'unit_cost']])

df_product_2 = df_product.copy()

df_product_2[['unit_price', 'unit_cost']] = imp_values.round()

df_product_2.isnull().sum()

product_cd            0
category_major_cd     0
category_medium_cd    0
category_small_cd     0
unit_price            0
unit_cost             0
dtype: int64


# コード例1（Pandasのfillna）
df_product_3 = df_product.fillna({
    'unit_price':np.round(np.nanmedian(df_product['unit_price'])), 
    'unit_cost':np.round(np.nanmedian(df_product['unit_cost']))})

df_product_3.isnull().sum()

product_cd            0
category_major_cd     0
category_medium_cd    0
category_small_cd     0
unit_price            0
unit_cost             0
dtype: int64


# コード例2（scikit-learnのSimpleImputer）
imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')

imp_values = imp_mean.fit_transform(df_product[['unit_price', 'unit_cost']])

df_product_3 = df_product.copy()

df_product_3[['unit_price', 'unit_cost']] = imp_values.round()

df_product_3.isnull().sum()

product_cd            0
category_major_cd     0
category_medium_cd    0
category_small_cd     0
unit_price            0
unit_cost             0
dtype: int64


# コード例1
df_tmp = (df_product.groupby('category_small_cd')
          .agg(median_price=('unit_price', 'median'), 
               median_cost=('unit_cost', 'median')).reset_index())

df_product_4 = pd.merge(df_product, df_tmp, how='inner', on='category_small_cd')

df_product_4['unit_price'] = df_product_4[['unit_price', 'median_price']]. \
            apply(lambda x: np.round(x[1]) if np.isnan(x[0]) else x[0], axis=1)

df_product_4['unit_cost'] = df_product_4[['unit_cost', 'median_cost']]. \
            apply(lambda x: np.round(x[1]) if np.isnan(x[0]) else x[0], axis=1)

df_product_4.isnull().sum()

product_cd            0
category_major_cd     0
category_medium_cd    0
category_small_cd     0
unit_price            0
unit_cost             0
median_price          0
median_cost           0
dtype: int64


# コード例2（maskの活用）
df_tmp = (df_product.groupby('category_small_cd')
          .agg(median_price=('unit_price', 'median'), 
               median_cost=('unit_cost', 'median')).reset_index())

df_product_4 = df_product.merge(df_tmp, how='inner', on='category_small_cd')

df_product_4['unit_price'] = (df_product_4['unit_price']
                              .mask(df_product_4['unit_price'].isnull(), 
                                    df_product_4['median_price'].round()))

df_product_4['unit_cost'] = (df_product_4['unit_cost']
                              .mask(df_product_4['unit_cost'].isnull(), 
                                    df_product_4['median_cost'].round()))

df_product_4.isnull().sum()

product_cd            0
category_major_cd     0
category_medium_cd    0
category_small_cd     0
unit_price            0
unit_cost             0
median_price          0
median_cost           0
dtype: int64


# コード例3（fillna、transformの活用）
df_product_4 = df_product.copy()

for x in ['unit_price', 'unit_cost']: 
    df_product_4[x] = (df_product_4[x]
                       .fillna(df_product_4.groupby('category_small_cd')[x]
                               .transform('median')
                               .round()))

df_product_4.isnull().sum()

product_cd            0
category_major_cd     0
category_medium_cd    0
category_small_cd     0
unit_price            0
unit_cost             0
dtype: int64


df_receipt_2019 = df_receipt.query('20190101 <= sales_ymd <= 20191231') \
                            .groupby('customer_id') \
                            .agg(amount_2019=('amount', 'sum')) \
                            .reset_index()

df_receipt_all = df_receipt.groupby('customer_id')\
                            .agg(amount_all=('amount', 'sum')) \
                            .reset_index()

df_sales_rate = df_customer[['customer_id']] \
                    .merge(df_receipt_2019, how='left', on='customer_id') \
                    .merge(df_receipt_all, how='left', on='customer_id')

df_sales_rate['amount_2019'] = df_sales_rate['amount_2019'].fillna(0)
df_sales_rate['amount_all'] = df_sales_rate['amount_all'].fillna(0)

df_sales_rate['amount_rate'] = \
                df_sales_rate[['amount_2019','amount_all']] \
                    .apply(lambda x: 0 if x[0] == 0 else x[0] / x[1], axis=1)

df_sales_rate['amount_rate'] = df_sales_rate['amount_rate'].fillna(0)

df_sales_rate.query('amount_rate > 0').head(10)


df_tmp.isnull().sum()

category_small_cd    0
median_price         0
median_cost          0
dtype: int64


df_geocode_1 = df_geocode.groupby('postal_cd') \
                .agg(m_longitude=('longitude', 'mean'), 
                     m_latitude=('latitude', 'mean')).reset_index()

df_customer_1 = pd.merge(df_customer, df_geocode_1, 
                         how='inner', on='postal_cd')

df_customer_1.head(10)


# コード例1
def calc_distance(x1, y1, x2, y2):
    distance = 6371 * math.acos(math.sin(math.radians(x1)) 
                    * math.sin(math.radians(x2)) 
                    + math.cos(math.radians(x1)) 
                    * math.cos(math.radians(x2)) 
                    * math.cos(math.radians(y1) - math.radians(y2)))
    return distance

df_tmp = pd.merge(df_customer_1, df_store, 
                  how='inner', 
                  left_on='application_store_cd', 
                  right_on='store_cd') \
        .rename(columns={'address_x':'customer_address', 
                         'address_y':'store_address'})

df_tmp['distance'] =   df_tmp[['m_latitude', 
                               'm_longitude',
                               'latitude', 
                               'longitude']] \
                        .apply(lambda x: calc_distance(x[0], x[1], x[2], x[3]), 
                               axis=1)

df_tmp[['customer_id', 'customer_address', 
        'store_address', 'distance']].head(10)


# コード例2
def calc_distance_numpy(x1, y1, x2, y2):
    x1_r = np.radians(x1)
    x2_r = np.radians(x2)
    y1_r = np.radians(y1)
    y2_r = np.radians(y2)
    return 6371 * np.arccos(np.sin(x1_r) * np.sin(x2_r) 
                            + np.cos(x1_r) * np.cos(x2_r) 
                            * np.cos(y1_r - y2_r))

df_tmp = df_customer_1.merge(df_store, 
                             how='inner', 
                             left_on='application_store_cd', 
                             right_on='store_cd') \
                      .rename(columns={'address_x':'customer_address', 
                                       'address_y':'store_address'})

df_tmp['distance'] =  calc_distance_numpy(df_tmp['m_latitude'], 
                                          df_tmp['m_longitude'],
                                          df_tmp['latitude'], 
                                          df_tmp['longitude'])

df_tmp[['customer_id', 'customer_address', 
        'store_address', 'distance']].head(10)


df_receipt_tmp = df_receipt.groupby('customer_id') \
                    .agg(sum_amount=('amount','sum')).reset_index()

df_customer_u = pd.merge(df_customer, df_receipt_tmp, 
                         how='left', 
                         on='customer_id')

df_customer_u['sum_amount'] = df_customer_u['sum_amount'].fillna(0)

df_customer_u = df_customer_u.sort_values(['sum_amount', 'customer_id'], 
                                          ascending=[False, True])

df_customer_u.drop_duplicates(subset=['customer_name', 'postal_cd'], 
                              keep='first', inplace=True)

print('df_customer_cnt:', len(df_customer),
      'df_customer_u_cnt:', len(df_customer_u),
      'diff:', len(df_customer) - len(df_customer_u))

df_customer_cnt: 21971 df_customer_u_cnt: 21941 diff: 30


df_customer_n = pd.merge(df_customer, 
                         df_customer_u[['customer_name', 
                                        'postal_cd', 'customer_id']],
                         how='inner', on =['customer_name', 'postal_cd'])

df_customer_n.rename(columns={'customer_id_x':'customer_id', 
                              'customer_id_y':'integration_id'}, inplace=True)

print('ID数の差', len(df_customer_n['customer_id'].unique()) 
                         - len(df_customer_n['integration_id'].unique()))

ID数の差 30


del df_customer_1
del df_customer_n


df_sales_customer = df_receipt.groupby('customer_id') \
                                    .agg({'amount':sum}).reset_index()

df_sales_customer = df_sales_customer.query('amount > 0')

df_tmp = pd.merge(df_customer, df_sales_customer['customer_id'], 
                  how='inner', on='customer_id')

df_train, df_test = train_test_split(df_tmp, test_size=0.2, random_state=71)

print('学習データ割合: ', len(df_train) / len(df_tmp))
print('テストデータ割合: ', len(df_test) / len(df_tmp))

学習データ割合:  0.7999036840837949
テストデータ割合:  0.20009631591620516


# コード例1（自作関数）
df_ts_amount = df_receipt[['sales_ymd', 'amount']].copy()

df_ts_amount['sales_ym'] = df_ts_amount['sales_ymd'].astype('str').str[0:6]

df_ts_amount = df_ts_amount.groupby('sales_ym') \
                                        .agg({'amount':'sum'}).reset_index()

# 長期間データに対する多数のデータセットもループなどで処理できるように関数化
def split_data(df, train_size, test_size, slide_window, start_point):
    train_start = start_point * slide_window
    test_start = train_start + train_size
    return df[train_start:test_start], df[test_start:test_start + test_size]

df_train_1, df_test_1 = split_data(df_ts_amount, train_size=12, 
                                   test_size=6, slide_window=6, start_point=0)

df_train_2, df_test_2 = split_data(df_ts_amount, train_size=12, 
                                   test_size=6, slide_window=6, start_point=1)

df_train_3, df_test_3 = split_data(df_ts_amount, train_size=12, 
                                   test_size=6, slide_window=6, start_point=2)


# df_train_2とdf_train_3の表示は割愛
df_train_1


# df_test_2とdf_test_３の表示は割愛
df_test_1


# コード例2（scikit-learnのTimeSeriesSplit）
tscv = TimeSeriesSplit(gap=0, max_train_size=12, n_splits=3, test_size=6)

# TimeSeriesSplitは最新のデータが使われるように分割されるが、
# SQL、Rの解答例と同じとなるようにデータ期間を調整
# できる限り最新データを使うようにするなら不要
df_ts_amount = df_ts_amount.query('sales_ym <= "201906"')

series_list = []
for train_index, test_index in tscv.split(df_ts_amount):
    series_list.append((df_ts_amount.loc[train_index], 
                        df_ts_amount.loc[test_index]))
    
df_train_1, df_test_1 = series_list[0]
df_train_2, df_test_2 = series_list[1]
df_train_3, df_test_3 = series_list[2]


# df_train_2とdf_train_3の表示は割愛
df_train_1


# df_test_2とdf_test_３の表示は割愛
df_test_1


df_tmp = df_receipt.groupby('customer_id').agg({'amount':'sum'}).reset_index()

df_tmp = pd.merge(df_customer, df_tmp, how='left', on='customer_id')

df_tmp['is_buy_flag'] = np.where(df_tmp['amount'].isnull(), 0, 1)

rs = RandomUnderSampler(random_state=71)

df_down_sampling, _ = rs.fit_resample(df_tmp, df_tmp.is_buy_flag)

print('0の件数', len(df_down_sampling.query('is_buy_flag == 0')))
print('1の件数', len(df_down_sampling.query('is_buy_flag == 1')))

0の件数 8306
1の件数 8306


df_gender_std = df_customer[['gender_cd', 'gender']].drop_duplicates()

df_customer_std = df_customer.drop(columns='gender')


# データの内容確認
df_customer_std.head(3)


# データの内容確認
df_gender_std.head(3)


df_product_full = pd.merge(df_product, df_category[['category_small_cd', 
                                                    'category_major_name',
                                                    'category_medium_name',
                                                    'category_small_name']], 
                           how = 'inner', on = 'category_small_cd')

# データの内容確認
df_product_full.head(3)


# コード例1
# 解答ファイルの置き場所が設問ファイルと異なるため、パスが'../data'となっている点に注意
df_product_full.to_csv('../data/P_df_product_full_UTF-8_header.csv', 
                       encoding='UTF-8', index=False)


# コード例2（BOM付きでExcelの文字化けを防ぐ）
df_product_full.to_csv('../data/P_df_product_full_UTF-8BOM_header.csv', 
                       encoding='utf_8_sig', index=False)


# 解答ファイルの置き場所が設問ファイルと異なるため、パスが'../data'となっている点に注意
df_product_full.to_csv('../data/P_df_product_full_CP932_header.csv', 
                       encoding='CP932', index=False)


# 解答ファイルの置き場所が設問ファイルと異なるため、パスが'../data'となっている点に注意
df_product_full.to_csv('../data/P_df_product_full_UTF-8_noh.csv', 
                       header=False, encoding='UTF-8', index=False)


# 解答ファイルの置き場所が設問ファイルと異なるため、パスが'../data'となっている点に注意
df_product_full = pd.read_csv('../data/P_df_product_full_UTF-8_header.csv',
                    dtype={'category_major_cd':str,
                           'category_medium_cd':str,
                           'category_small_cd':str},
                    encoding='UTF-8')

df_product_full.head(3)


# コード例1（後から項目名をつける）
# 解答ファイルの置き場所が設問ファイルと異なるため、パスが'../data'となっている点に注意
df_product_full = pd.read_csv('../data/P_df_product_full_UTF-8_noh.csv',
                     dtype={1:str,
                            2:str,
                            3:str},
                     encoding='UTF-8', header=None)

df_product_full.columns = ['product_cd','category_major_cd',
                           'category_medium_cd', 'category_small_cd',
                           'unit_price','unit_cost','category_major_name',
                           'category_medium_name', 'category_small_name']

df_product_full.head(3)


# コード例2（先に項目名を定義する）
c_names = ['product_cd','category_major_cd','category_medium_cd',
           'category_small_cd','unit_price','unit_cost',
           'category_major_name','category_medium_name','category_small_name']

df_product_full = pd.read_csv('../data/P_df_product_full_UTF-8_noh.csv',
                     names=c_names,
                     dtype={'category_major_cd':str,
                            'category_medium_cd':str,
                            'category_small_cd':str},
                     encoding='UTF-8', header=None)

df_product_full.head(3)


# 解答ファイルの置き場所が設問ファイルと異なるため、パスが'../data'となっている点に注意
df_product_full.to_csv('../data/P_df_product_full_UTF-8_header.tsv', 
                       sep='\t', encoding='UTF-8', index=False)


# コード例1（read_table）
# 解答ファイルの置き場所が設問ファイルと異なるため、パスが'../data'となっている点に注意
df_product_full = pd.read_table('../data/P_df_product_full_UTF-8_header.tsv', 
                       dtype={'category_major_cd':str,
                              'category_medium_cd':str,
                              'category_small_cd':str},
                       encoding='UTF-8')
df_product_full.head(3)


# コード例2（read_csv）
df_product_full = pd.read_csv('../data/P_df_product_full_UTF-8_header.tsv', 
                       dtype={'category_major_cd':str,
                              'category_medium_cd':str,
                              'category_small_cd':str},
                       sep='\t', encoding='UTF-8')
df_product_full.head(3)

	sales_ymd	sales_epoch	store_cd	receipt_no	receipt_sub_no	customer_id	product_cd	quantity	amount
0	20181103	1541203200	S14006	112	1	CS006214000001	P070305012	1	158
1	20181118	1542499200	S13008	1132	2	CS008415000097	P070701017	1	81
2	20170712	1499817600	S14028	1102	1	CS028414000014	P060101005	1	170
3	20190205	1549324800	S14042	1132	1	ZZ000000000000	P050301001	1	25
4	20180821	1534809600	S14025	1102	2	CS025415000050	P060102007	1	90
5	20190605	1559692800	S13003	1112	1	CS003515000195	P050102002	1	138
6	20181205	1543968000	S14024	1102	2	CS024514000042	P080101005	1	30
7	20190922	1569110400	S14040	1102	1	CS040415000178	P070501004	1	128
8	20170504	1493856000	S13020	1112	2	ZZ000000000000	P071302010	1	770
9	20191010	1570665600	S14027	1102	1	CS027514000015	P071101003	1	680

	sales_ymd	customer_id	product_cd	amount
0	20181103	CS006214000001	P070305012	158
1	20181118	CS008415000097	P070701017	81
2	20170712	CS028414000014	P060101005	170
3	20190205	ZZ000000000000	P050301001	25
4	20180821	CS025415000050	P060102007	90
5	20190605	CS003515000195	P050102002	138
6	20181205	CS024514000042	P080101005	30
7	20190922	CS040415000178	P070501004	128
8	20170504	ZZ000000000000	P071302010	770
9	20191010	CS027514000015	P071101003	680

	sales_date	customer_id	product_cd	amount
0	20181103	CS006214000001	P070305012	158
1	20181118	CS008415000097	P070701017	81
2	20170712	CS028414000014	P060101005	170
3	20190205	ZZ000000000000	P050301001	25
4	20180821	CS025415000050	P060102007	90
5	20190605	CS003515000195	P050102002	138
6	20181205	CS024514000042	P080101005	30
7	20190922	CS040415000178	P070501004	128
8	20170504	ZZ000000000000	P071302010	770
9	20191010	CS027514000015	P071101003	680

	sales_ymd	customer_id	product_cd	amount
36	20180911	CS018205000001	P071401012	2200
9843	20180414	CS018205000001	P060104007	600
21110	20170614	CS018205000001	P050206001	990
27673	20170614	CS018205000001	P060702015	108
27840	20190216	CS018205000001	P071005024	102
28757	20180414	CS018205000001	P071101002	278
39256	20190226	CS018205000001	P070902035	168
58121	20190924	CS018205000001	P060805001	495
68117	20190226	CS018205000001	P071401020	2200
72254	20180911	CS018205000001	P071401005	1100
88508	20190216	CS018205000001	P040101002	218
91525	20190924	CS018205000001	P091503001	280

	sales_ymd	customer_id	product_cd	amount
36	20180911	CS018205000001	P071401012	2200
9843	20180414	CS018205000001	P060104007	600
21110	20170614	CS018205000001	P050206001	990
27673	20170614	CS018205000001	P060702015	108
27840	20190216	CS018205000001	P071005024	102
28757	20180414	CS018205000001	P071101002	278
39256	20190226	CS018205000001	P070902035	168
58121	20190924	CS018205000001	P060805001	495
68117	20190226	CS018205000001	P071401020	2200
72254	20180911	CS018205000001	P071401005	1100
88508	20190216	CS018205000001	P040101002	218
91525	20190924	CS018205000001	P091503001	280

データサイエンス100本ノック（構造化データ加工編） - Python¶

はじめに¶

演習問題¶

これで１００本終わりです。おつかれさまでした！¶

	store_cd	store_name	prefecture_cd	prefecture	address	address_kana	tel_no	longitude	latitude	floor_area
18	S14046	北山田店	14	神奈川県	神奈川県横浜市都筑区北山田一丁目	カナガワケンヨコハマシツヅキクキタヤマタイッチョウメ	045-123-4049	139.5916	35.56189	831.0
20	S14011	日吉本町店	14	神奈川県	神奈川県横浜市港北区日吉本町四丁目	カナガワケンヨコハマシコウホククヒヨシホンチョウヨンチョウメ	045-123-4033	139.6316	35.54655	890.0
38	S12013	習志野店	12	千葉県	千葉県習志野市芝園一丁目	チバケンナラシノシシバゾノイッチョウメ	047-123-4002	140.0220	35.66122	808.0

	store_cd	store_name	prefecture_cd	prefecture	address	address_kana	tel_no	longitude	latitude	floor_area
2	S14010	菊名店	14	神奈川県	神奈川県横浜市港北区菊名一丁目	カナガワケンヨコハマシコウホククキクナイッチョウメ	045-123-4032	139.6326	35.50049	1732.0
3	S14033	阿久和店	14	神奈川県	神奈川県横浜市瀬谷区阿久和西一丁目	カナガワケンヨコハマシセヤクアクワニシイッチョウメ	045-123-4043	139.4961	35.45918	1495.0
4	S14036	相模原中央店	14	神奈川県	神奈川県相模原市中央二丁目	カナガワケンサガミハラシチュウオウニチョウメ	042-123-4045	139.3716	35.57327	1679.0
7	S14040	長津田店	14	神奈川県	神奈川県横浜市緑区長津田みなみ台五丁目	カナガワケンヨコハマシミドリクナガツタミナミダイゴチョウメ	045-123-4046	139.4994	35.52398	1548.0
9	S14050	阿久和西店	14	神奈川県	神奈川県横浜市瀬谷区阿久和西一丁目	カナガワケンヨコハマシセヤクアクワニシイッチョウメ	045-123-4053	139.4961	35.45918	1830.0
12	S14028	二ツ橋店	14	神奈川県	神奈川県横浜市瀬谷区二ツ橋町	カナガワケンヨコハマシセヤクフタツバシチョウ	045-123-4042	139.4963	35.46304	1574.0
16	S14012	本牧和田店	14	神奈川県	神奈川県横浜市中区本牧和田	カナガワケンヨコハマシナカクホンモクワダ	045-123-4034	139.6582	35.42156	1341.0
18	S14046	北山田店	14	神奈川県	神奈川県横浜市都筑区北山田一丁目	カナガワケンヨコハマシツヅキクキタヤマタイッチョウメ	045-123-4049	139.5916	35.56189	831.0
19	S14022	逗子店	14	神奈川県	神奈川県逗子市逗子一丁目	カナガワケンズシシズシイッチョウメ	046-123-4036	139.5789	35.29642	1838.0
20	S14011	日吉本町店	14	神奈川県	神奈川県横浜市港北区日吉本町四丁目	カナガワケンヨコハマシコウホククヒヨシホンチョウヨンチョウメ	045-123-4033	139.6316	35.54655	890.0

	customer_id	customer_name	gender_cd	gender	birth_day	age	postal_cd	address	application_store_cd	application_date	status_cd
1	CS037613000071	六角雅彦	9	不明	1952-04-01	66	136-0076	東京都江東区南砂**********	S13037	20150414	0-00000000-0
3	CS028811000001	堀井かおり	1	女性	1933-03-27	86	245-0016	神奈川県横浜市泉区和泉町**********	S14028	20160115	0-00000000-0
14	CS040412000191	川井郁恵	1	女性	1977-01-05	42	226-0021	神奈川県横浜市緑区北八朔町**********	S14040	20151101	1-20091025-4
31	CS028314000011	小菅あおい	1	女性	1983-11-26	35	246-0038	神奈川県横浜市瀬谷区宮沢**********	S14028	20151123	1-20080426-5
56	CS039212000051	藤島恵梨香	1	女性	1997-02-03	22	166-0001	東京都杉並区阿佐谷北**********	S13039	20171121	1-20100215-4
59	CS015412000111	松居奈月	1	女性	1972-10-04	46	136-0071	東京都江東区亀戸**********	S13015	20150629	0-00000000-0
63	CS004702000041	野島洋	0	男性	1943-08-24	75	176-0022	東京都練馬区向山**********	S13004	20170218	0-00000000-0
74	CS041515000001	栗田千夏	1	女性	1967-01-02	52	206-0001	東京都多摩市和田**********	S13041	20160422	E-20100803-F
85	CS029313000221	北条ひかり	1	女性	1987-06-19	31	279-0011	千葉県浦安市美浜**********	S12029	20180810	0-00000000-0
102	CS034312000071	望月奈央	1	女性	1980-09-20	38	213-0026	神奈川県川崎市高津区久末**********	S14034	20160106	0-00000000-0

	customer_id	customer_name	gender_cd	gender	birth_day	age	postal_cd	address	application_store_cd	application_date	status_cd
2	CS031415000172	宇多田貴美子	1	女性	1976-10-04	42	151-0053	東京都渋谷区代々木**********	S13031	20150529	D-20100325-C
6	CS015414000103	奥野陽子	1	女性	1977-08-09	41	136-0073	東京都江東区北砂**********	S13015	20150722	B-20100609-B
12	CS011215000048	芦田沙耶	1	女性	1992-02-01	27	223-0062	神奈川県横浜市港北区日吉本町**********	S14011	20150228	C-20100421-9
15	CS029415000023	梅田里穂	1	女性	1976-01-17	43	279-0043	千葉県浦安市富士見**********	S12029	20150610	D-20100918-E
21	CS035415000029	寺沢真希	9	不明	1977-09-27	41	158-0096	東京都世田谷区玉川台**********	S13035	20141220	F-20101029-F
32	CS031415000106	宇野由美子	1	女性	1970-02-26	49	151-0053	東京都渋谷区代々木**********	S13031	20150201	F-20100511-E
33	CS029215000025	石倉美帆	1	女性	1993-09-28	25	279-0022	千葉県浦安市今川**********	S12029	20150708	B-20100820-C
40	CS033605000005	猪股雄太	0	男性	1955-12-05	63	246-0031	神奈川県横浜市瀬谷区瀬谷**********	S14033	20150425	F-20100917-E
44	CS033415000229	板垣菜々美	1	女性	1977-11-07	41	246-0021	神奈川県横浜市瀬谷区二ツ橋町**********	S14033	20150712	F-20100326-E
53	CS008415000145	黒谷麻緒	1	女性	1977-06-27	41	157-0067	東京都世田谷区喜多見**********	S13008	20150829	F-20100622-F

	customer_id	customer_name	gender_cd	gender	birth_day	age	postal_cd	address	application_store_cd	application_date	status_cd
4	CS001215000145	田崎美紀	1	女性	1995-03-29	24	144-0055	東京都大田区仲六郷**********	S13001	20170605	6-20090929-2
9	CS033513000180	安斎遥	1	女性	1962-07-11	56	241-0823	神奈川県横浜市旭区善部町**********	S14033	20150728	6-20080506-5
12	CS011215000048	芦田沙耶	1	女性	1992-02-01	27	223-0062	神奈川県横浜市港北区日吉本町**********	S14011	20150228	C-20100421-9
14	CS040412000191	川井郁恵	1	女性	1977-01-05	42	226-0021	神奈川県横浜市緑区北八朔町**********	S14040	20151101	1-20091025-4
16	CS009315000023	皆川文世	1	女性	1980-04-15	38	154-0012	東京都世田谷区駒沢**********	S13009	20150319	5-20080322-1
22	CS015315000033	福士璃奈子	1	女性	1983-03-17	36	135-0043	東京都江東区塩浜**********	S13015	20141024	4-20080219-3
23	CS023513000066	神戸そら	1	女性	1961-12-17	57	210-0005	神奈川県川崎市川崎区東田町**********	S14023	20150915	5-20100524-9
24	CS035513000134	市川美帆	1	女性	1960-03-27	59	156-0053	東京都世田谷区桜**********	S13035	20150227	8-20100711-9
27	CS001515000263	高松夏空	1	女性	1962-11-09	56	144-0051	東京都大田区西蒲田**********	S13001	20160812	1-20100804-1
28	CS040314000027	鶴田きみまろ	9	不明	1986-03-26	33	226-0027	神奈川県横浜市緑区長津田**********	S14040	20150122	2-20080426-4

	store_cd	store_name	prefecture_cd	prefecture	address	address_kana	tel_no	longitude	latitude	floor_area
0	S12014	千草台店	12	千葉県	千葉県千葉市稲毛区千草台一丁目	チバケンチバシイナゲクチグサダイイッチョウメ	043-123-4003	140.1180	35.63559	1698.0
1	S13002	国分寺店	13	東京都	東京都国分寺市本多二丁目	トウキョウトコクブンジシホンダニチョウメ	042-123-4008	139.4802	35.70566	1735.0
2	S14010	菊名店	14	神奈川県	神奈川県横浜市港北区菊名一丁目	カナガワケンヨコハマシコウホククキクナイッチョウメ	045-123-4032	139.6326	35.50049	1732.0
3	S14033	阿久和店	14	神奈川県	神奈川県横浜市瀬谷区阿久和西一丁目	カナガワケンヨコハマシセヤクアクワニシイッチョウメ	045-123-4043	139.4961	35.45918	1495.0
4	S14036	相模原中央店	14	神奈川県	神奈川県相模原市中央二丁目	カナガワケンサガミハラシチュウオウニチョウメ	042-123-4045	139.3716	35.57327	1679.0
7	S14040	長津田店	14	神奈川県	神奈川県横浜市緑区長津田みなみ台五丁目	カナガワケンヨコハマシミドリクナガツタミナミダイゴチョウメ	045-123-4046	139.4994	35.52398	1548.0
9	S14050	阿久和西店	14	神奈川県	神奈川県横浜市瀬谷区阿久和西一丁目	カナガワケンヨコハマシセヤクアクワニシイッチョウメ	045-123-4053	139.4961	35.45918	1830.0
11	S13052	森野店	13	東京都	東京都町田市森野三丁目	トウキョウトマチダシモリノサンチョウメ	042-123-4030	139.4383	35.55293	1087.0
12	S14028	二ツ橋店	14	神奈川県	神奈川県横浜市瀬谷区二ツ橋町	カナガワケンヨコハマシセヤクフタツバシチョウ	045-123-4042	139.4963	35.46304	1574.0
16	S14012	本牧和田店	14	神奈川県	神奈川県横浜市中区本牧和田	カナガワケンヨコハマシナカクホンモクワダ	045-123-4034	139.6582	35.42156	1341.0
18	S14046	北山田店	14	神奈川県	神奈川県横浜市都筑区北山田一丁目	カナガワケンヨコハマシツヅキクキタヤマタイッチョウメ	045-123-4049	139.5916	35.56189	831.0
19	S14022	逗子店	14	神奈川県	神奈川県逗子市逗子一丁目	カナガワケンズシシズシイッチョウメ	046-123-4036	139.5789	35.29642	1838.0
20	S14011	日吉本町店	14	神奈川県	神奈川県横浜市港北区日吉本町四丁目	カナガワケンヨコハマシコウホククヒヨシホンチョウヨンチョウメ	045-123-4033	139.6316	35.54655	890.0
21	S13016	小金井店	13	東京都	東京都小金井市本町一丁目	トウキョウトコガネイシホンチョウイッチョウメ	042-123-4015	139.5094	35.70018	1399.0
22	S14034	川崎野川店	14	神奈川県	神奈川県川崎市宮前区野川	カナガワケンカワサキシミヤマエクノガワ	044-123-4044	139.5998	35.57693	1318.0
26	S14048	中川中央店	14	神奈川県	神奈川県横浜市都筑区中川中央二丁目	カナガワケンヨコハマシツヅキクナカガワチュウオウニチョウメ	045-123-4051	139.5758	35.54912	1657.0
27	S12007	佐倉店	12	千葉県	千葉県佐倉市上志津	チバケンサクラシカミシヅ	043-123-4001	140.1452	35.71872	1895.0
28	S14026	辻堂西海岸店	14	神奈川県	神奈川県藤沢市辻堂西海岸二丁目	カナガワケンフジサワシツジドウニシカイガンニチョウメ	046-123-4040	139.4466	35.32464	1732.0
29	S13041	八王子店	13	東京都	東京都八王子市大塚	トウキョウトハチオウジシオオツカ	042-123-4026	139.4235	35.63787	810.0
31	S14049	川崎大師店	14	神奈川県	神奈川県川崎市川崎区中瀬三丁目	カナガワケンカワサキシカワサキクナカゼサンチョウメ	044-123-4052	139.7327	35.53759	962.0
32	S14023	川崎店	14	神奈川県	神奈川県川崎市川崎区本町二丁目	カナガワケンカワサキシカワサキクホンチョウニチョウメ	044-123-4037	139.7028	35.53599	1804.0
33	S13018	清瀬店	13	東京都	東京都清瀬市松山一丁目	トウキョウトキヨセシマツヤマイッチョウメ	042-123-4017	139.5178	35.76885	1220.0
35	S14027	南藤沢店	14	神奈川県	神奈川県藤沢市南藤沢	カナガワケンフジサワシミナミフジサワ	046-123-4041	139.4896	35.33762	1521.0
36	S14021	伊勢原店	14	神奈川県	神奈川県伊勢原市伊勢原四丁目	カナガワケンイセハラシイセハラヨンチョウメ	046-123-4035	139.3129	35.40169	962.0
37	S14047	相模原店	14	神奈川県	神奈川県相模原市千代田六丁目	カナガワケンサガミハラシチヨダロクチョウメ	042-123-4050	139.3748	35.55959	1047.0
38	S12013	習志野店	12	千葉県	千葉県習志野市芝園一丁目	チバケンナラシノシシバゾノイッチョウメ	047-123-4002	140.0220	35.66122	808.0
40	S14042	新山下店	14	神奈川県	神奈川県横浜市中区新山下二丁目	カナガワケンヨコハマシナカクシンヤマシタニチョウメ	045-123-4047	139.6593	35.43894	1044.0
42	S12030	八幡店	12	千葉県	千葉県市川市八幡三丁目	チバケンイチカワシヤワタサンチョウメ	047-123-4005	139.9240	35.72318	1162.0
44	S14025	大和店	14	神奈川県	神奈川県大和市下和田	カナガワケンヤマトシシモワダ	046-123-4039	139.4680	35.43414	1011.0
45	S14045	厚木店	14	神奈川県	神奈川県厚木市中町二丁目	カナガワケンアツギシナカチョウニチョウメ	046-123-4048	139.3651	35.44182	980.0
47	S12029	東野店	12	千葉県	千葉県浦安市東野一丁目	チバケンウラヤスシヒガシノイッチョウメ	047-123-4004	139.8968	35.65086	1101.0
49	S12053	高洲店	12	千葉県	千葉県浦安市高洲五丁目	チバケンウラヤスシタカスゴチョウメ	047-123-4006	139.9176	35.63755	1555.0
51	S14024	三田店	14	神奈川県	神奈川県川崎市多摩区三田四丁目	カナガワケンカワサキシタマクミタヨンチョウメ	044-123-4038	139.5424	35.60770	972.0
52	S14006	葛が谷店	14	神奈川県	神奈川県横浜市都筑区葛が谷	カナガワケンヨコハマシツヅキククズガヤ	045-123-4031	139.5633	35.53573	1886.0

	customer_id	customer_name	gender_cd	gender	birth_day	age	postal_cd	address	application_store_cd	application_date	status_cd
18817	CS003813000014	村山菜々美	1	女性	1928-11-26	90	182-0007	東京都調布市菊野台**********	S13003	20160214	0-00000000-0
12328	CS026813000004	吉村朝陽	1	女性	1928-12-14	90	251-0043	神奈川県藤沢市辻堂元町**********	S14026	20150723	0-00000000-0
15682	CS018811000003	熊沢美里	1	女性	1929-01-07	90	204-0004	東京都清瀬市野塩**********	S13018	20150403	0-00000000-0
15302	CS027803000004	内村拓郎	0	男性	1929-01-12	90	251-0031	神奈川県藤沢市鵠沼藤が谷**********	S14027	20151227	0-00000000-0
1681	CS013801000003	天野拓郎	0	男性	1929-01-15	90	274-0824	千葉県船橋市前原東**********	S12013	20160120	0-00000000-0
7511	CS001814000022	鶴田里穂	1	女性	1929-01-28	90	144-0045	東京都大田区南六郷**********	S13001	20161012	A-20090415-7
2378	CS016815000002	山元美紀	1	女性	1929-02-22	90	184-0005	東京都小金井市桜町**********	S13016	20150629	C-20090923-C
4680	CS009815000003	中田里穂	1	女性	1929-04-08	89	154-0014	東京都世田谷区新町**********	S13009	20150421	D-20091021-E
16070	CS005813000015	金谷恵梨香	1	女性	1929-04-09	89	165-0032	東京都中野区鷺宮**********	S13005	20150506	0-00000000-0
6305	CS012813000013	宇野南朋	1	女性	1929-04-09	89	231-0806	神奈川県横浜市中区本牧町**********	S14012	20150712	0-00000000-0

	customer_id	customer_name	gender_cd	gender	birth_day	age	postal_cd	address	application_store_cd	application_date	status_cd
15639	CS035114000004	大村美里	1	女性	2007-11-25	11	156-0053	東京都世田谷区桜**********	S13035	20150619	6-20091205-6
7468	CS022103000002	福山はじめ	9	不明	2007-10-02	11	249-0006	神奈川県逗子市逗子**********	S14022	20160909	0-00000000-0
10745	CS002113000009	柴田真悠子	1	女性	2007-09-17	11	184-0014	東京都小金井市貫井南町**********	S13002	20160304	0-00000000-0
19811	CS004115000014	松井京子	1	女性	2007-08-09	11	165-0031	東京都中野区上鷺宮**********	S13004	20161120	1-20081231-1
7039	CS002114000010	山内遥	1	女性	2007-06-03	11	184-0015	東京都小金井市貫井北町**********	S13002	20160920	6-20100510-1
3670	CS025115000002	小柳夏希	1	女性	2007-04-18	11	245-0018	神奈川県横浜市泉区上飯田町**********	S14025	20160116	D-20100913-D
12493	CS002113000025	広末まなみ	1	女性	2007-03-30	12	184-0015	東京都小金井市貫井北町**********	S13002	20171030	0-00000000-0
15977	CS033112000003	長野美紀	1	女性	2007-03-22	12	245-0051	神奈川県横浜市戸塚区名瀬町**********	S14033	20150606	0-00000000-0
5716	CS007115000006	福岡瞬	1	女性	2007-03-10	12	285-0845	千葉県佐倉市西志津**********	S12007	20151118	F-20101016-F
15097	CS014113000008	矢口莉緒	1	女性	2007-03-05	12	260-0041	千葉県千葉市中央区東千葉**********	S12014	20150622	3-20091108-6

	customer_id	amount	ranking
1202	CS011415000006	10925	1.0
62317	ZZ000000000000	6800	2.0
54095	CS028605000002	5780	3.0
4632	CS015515000034	5480	4.0
72747	ZZ000000000000	5480	4.0
10320	ZZ000000000000	5480	4.0
97294	CS021515000089	5440	7.0
28304	ZZ000000000000	5440	7.0
92246	CS009415000038	5280	9.0
68553	CS040415000200	5280	9.0

	store_cd	amount	quantity
0	S12007	638761	2099
1	S12013	787513	2425
2	S12014	725167	2358
3	S12029	794741	2555
4	S12030	684402	2403
5	S13001	811936	2347
6	S13002	727821	2340
7	S13003	764294	2197
8	S13004	779373	2390
9	S13005	629876	2004
10	S13008	809288	2491
11	S13009	808870	2486
12	S13015	780873	2248
13	S13016	793773	2432
14	S13017	748221	2376
15	S13018	790535	2562
16	S13019	827833	2541
17	S13020	796383	2383
18	S13031	705968	2336
19	S13032	790501	2491
20	S13035	715869	2219
21	S13037	693087	2344
22	S13038	708884	2337
23	S13039	611888	1981
24	S13041	728266	2233
25	S13043	587895	1881
26	S13044	520764	1729
27	S13051	107452	354
28	S13052	100314	250
29	S14006	712839	2284
30	S14010	790361	2290
31	S14011	805724	2434
32	S14012	720600	2412
33	S14021	699511	2231
34	S14022	651328	2047
35	S14023	727630	2258
36	S14024	736323	2417
37	S14025	755581	2394
38	S14026	824537	2503
39	S14027	714550	2303
40	S14028	786145	2458
41	S14033	725318	2282
42	S14034	653681	2024
43	S14036	203694	635
44	S14040	701858	2233
45	S14042	534689	1935
46	S14045	458484	1398
47	S14046	412646	1354
48	S14047	338329	1041
49	S14048	234276	769
50	S14049	230808	788
51	S14050	167090	580

	customer_id	customer_name	gender_cd	gender	birth_day	age	postal_cd	address	application_store_cd	application_date	status_cd
12	CS011215000048	芦田沙耶	1	女性	1992-02-01	27	223-0062	神奈川県横浜市港北区日吉本町**********	S14011	20150228	C-20100421-9
68	CS022513000105	島村貴美子	1	女性	1962-03-12	57	249-0002	神奈川県逗子市山の根**********	S14022	20150320	A-20091115-7
71	CS001515000096	水野陽子	9	不明	1960-11-29	58	144-0053	東京都大田区蒲田本町**********	S13001	20150614	A-20100724-7
122	CS013615000053	西脇季衣	1	女性	1953-10-18	65	261-0026	千葉県千葉市美浜区幕張西**********	S12013	20150128	B-20100329-6
144	CS020412000161	小宮薫	1	女性	1974-05-21	44	174-0042	東京都板橋区東坂下**********	S13020	20150822	B-20081021-3
178	CS001215000097	竹中あさみ	1	女性	1990-07-25	28	146-0095	東京都大田区多摩川**********	S13001	20170315	A-20100211-2
252	CS035212000007	内村恵梨香	1	女性	1990-12-04	28	152-0023	東京都目黒区八雲**********	S13035	20151013	B-20101018-6
259	CS002515000386	野田コウ	1	女性	1963-05-30	55	185-0013	東京都国分寺市西恋ケ窪**********	S13002	20160410	C-20100127-8
293	CS001615000372	稲垣寿々花	1	女性	1956-10-29	62	144-0035	東京都大田区南蒲田**********	S13001	20170403	A-20100104-1
297	CS032512000121	松井知世	1	女性	1962-09-04	56	210-0011	神奈川県川崎市川崎区富士見**********	S13032	20150727	A-20100103-5

	customer_id	sales_ymd
0	CS001113000004	20190308
1	CS001114000005	20190731
2	CS001115000010	20190405
3	CS001205000004	20190625
4	CS001205000006	20190224
5	CS001211000025	20190322
6	CS001212000027	20170127
7	CS001212000031	20180906
8	CS001212000046	20170811
9	CS001212000070	20191018

	customer_id	sales_ymd
0	CS001113000004	20190308
1	CS001114000005	20180503
2	CS001115000010	20171228
3	CS001205000004	20170914
4	CS001205000006	20180207
5	CS001211000025	20190322
6	CS001212000027	20170127
7	CS001212000031	20180906
8	CS001212000046	20170811
9	CS001212000070	20191018

	customer_id_	sales_ymd_max	sales_ymd_min
1	CS001114000005	20190731	20180503
2	CS001115000010	20190405	20171228
3	CS001205000004	20190625	20170914
4	CS001205000006	20190224	20180207
13	CS001214000009	20190902	20170306
14	CS001214000017	20191006	20180828
16	CS001214000048	20190929	20171109
17	CS001214000052	20190617	20180208
20	CS001215000005	20181021	20170206
21	CS001215000040	20171022	20170214

	store_cd	amount
28	S13052	402.867470
12	S13015	351.111960
7	S13003	350.915519
30	S14010	348.791262
5	S13001	348.470386

	store_cd	amount
28	S13052	190.0
30	S14010	188.0
51	S14050	185.0
44	S14040	180.0
7	S13003	180.0

	store_cd	product_cd
0	S12007	P060303001
1	S12013	P060303001
2	S12014	P060303001
3	S12029	P060303001
4	S12030	P060303001
5	S13001	P060303001
6	S13002	P060303001
7	S13003	P071401001
8	S13004	P060303001
9	S13005	P040503001

	store_cd	amount
28	S13052	440088.701311
31	S14011	306314.558164
42	S14034	296920.081011
5	S13001	295431.993329
12	S13015	295294.361116

	store_cd	amount
28	S13052	663.391816
31	S14011	553.456916
42	S14034	544.903736
5	S13001	543.536561
12	S13015	543.409938

	store_cd	amount
1	S12013	330.194130
5	S13001	348.470386
7	S13003	350.915519
8	S13004	330.943949
12	S13015	351.111960
16	S13019	330.208616
17	S13020	337.879932
28	S13052	402.867470
30	S14010	348.791262
31	S14011	335.718333
38	S14026	332.340588
46	S14045	330.082073
48	S14047	330.077073

	customer_id	amount
2	CS001115000010	3044
4	CS001205000006	3337
13	CS001214000009	4685
14	CS001214000017	4132
17	CS001214000052	5639
21	CS001215000040	3496
30	CS001304000006	3726
32	CS001305000005	3485
33	CS001305000011	4370
53	CS001315000180	3300

	product_cd	category_major_cd	category_medium_cd	category_small_cd	unit_price	unit_cost	category_small_name
0	P040101001	04	0401	040101	198.0	149.0	弁当類
1	P040101002	04	0401	040101	218.0	164.0	弁当類
2	P040101003	04	0401	040101	230.0	173.0	弁当類
3	P040101004	04	0401	040101	248.0	186.0	弁当類
4	P040101005	04	0401	040101	268.0	201.0	弁当類
5	P040101006	04	0401	040101	298.0	224.0	弁当類
6	P040101007	04	0401	040101	338.0	254.0	弁当類
7	P040101008	04	0401	040101	420.0	315.0	弁当類
8	P040101009	04	0401	040101	498.0	374.0	弁当類
9	P040101010	04	0401	040101	580.0	435.0	弁当類

	customer_id	amount
0	CS021313000114	0.0
1	CS031415000172	5088.0
2	CS028811000001	0.0
3	CS001215000145	875.0
4	CS015414000103	3122.0
5	CS033513000180	868.0
6	CS035614000014	0.0
7	CS011215000048	3444.0
8	CS009413000079	0.0
9	CS040412000191	210.0

	customer_id	sales_ymd	amount
0	CS040214000008	23.0	NaN
1	CS015415000185	22.0	20153.0
2	CS010214000010	22.0	18585.0
3	CS010214000002	21.0	NaN
4	CS028415000007	21.0	19127.0
5	CS017415000097	20.0	23086.0
6	CS016415000141	20.0	18372.0
7	CS031414000051	19.0	19202.0
8	CS014214000023	19.0	NaN
9	CS022515000226	19.0	NaN
10	CS021515000172	19.0	NaN
11	CS039414000052	19.0	NaN
12	CS021514000045	19.0	NaN
13	CS022515000028	18.0	NaN
14	CS030214000008	18.0	NaN
15	CS021515000056	18.0	NaN
16	CS014415000077	18.0	NaN
17	CS021515000211	18.0	NaN
18	CS032415000209	18.0	NaN
19	CS031414000073	18.0	NaN
20	CS001605000009	NaN	18925.0
21	CS006515000023	NaN	18372.0
22	CS011414000106	NaN	18338.0
23	CS038415000104	NaN	17847.0
24	CS035414000024	NaN	17615.0
25	CS021515000089	NaN	17580.0
26	CS032414000072	NaN	16563.0
27	CS016415000101	NaN	16348.0
28	CS011415000006	NaN	16094.0
29	CS034415000047	NaN	16083.0
30	CS007514000094	NaN	15735.0
31	CS009414000059	NaN	15492.0
32	CS030415000034	NaN	15468.0
33	CS015515000034	NaN	15300.0

	sales_ymd	amount	lag_ymd	lag_amount	diff_amount
0	20170101	33723	NaN	NaN	NaN
1	20170102	24165	20170101.0	33723.0	-9558.0
2	20170103	27503	20170102.0	24165.0	3338.0
3	20170104	36165	20170103.0	27503.0	8662.0
4	20170105	37830	20170104.0	36165.0	1665.0
5	20170106	32387	20170105.0	37830.0	-5443.0
6	20170107	23415	20170106.0	32387.0	-8972.0
7	20170108	24737	20170107.0	23415.0	1322.0
8	20170109	26718	20170108.0	24737.0	1981.0
9	20170110	20143	20170109.0	26718.0	-6575.0

	era	male	female	unknown
0	10	1591.0	149836.0	4317.0
1	20	72940.0	1363724.0	44328.0
2	30	177322.0	693047.0	50441.0
3	40	19355.0	9320791.0	483512.0
4	50	54320.0	6685192.0	342923.0
5	60	272469.0	987741.0	71418.0
6	70	13435.0	29764.0	2427.0
7	80	46360.0	262923.0	5111.0
8	90	NaN	6260.0	NaN

	customer_id	birth_day
0	CS021313000114	19810429
1	CS037613000071	19520401
2	CS031415000172	19761004
3	CS028811000001	19330327
4	CS001215000145	19950329
5	CS020401000016	19740915
6	CS015414000103	19770809
7	CS029403000008	19730817
8	CS015804000004	19310502
9	CS033513000180	19620711

	customer_id	application_date
0	CS021313000114	2015-09-05
1	CS037613000071	2015-04-14
2	CS031415000172	2015-05-29
3	CS028811000001	2016-01-15
4	CS001215000145	2017-06-05
5	CS020401000016	2015-02-25
6	CS015414000103	2015-07-22
7	CS029403000008	2015-05-15
8	CS015804000004	2015-06-07
9	CS033513000180	2015-07-28

	receipt_no	receipt_sub_no	sales_ymd
0	112	1	2018-11-03
1	1132	2	2018-11-18
2	1102	1	2017-07-12
3	1132	1	2019-02-05
4	1102	2	2018-08-21
5	1112	1	2019-06-05
6	1102	2	2018-12-05
7	1102	1	2019-09-22
8	1112	2	2017-05-04
9	1102	1	2019-10-10

	receipt_no	receipt_sub_no	sales_year
0	112	1	2018
1	1132	2	2018
2	1102	1	2017
3	1132	1	2019
4	1102	2	2018
5	1112	1	2019
6	1102	2	2018
7	1102	1	2019
8	1112	2	2017
9	1102	1	2019

	customer_id	amount	sales_flg
0	CS001113000004	1298	0
1	CS001114000005	626	0
2	CS001115000010	3044	1
3	CS001205000004	1988	0
4	CS001205000006	3337	1
5	CS001211000025	456	0
6	CS001212000027	448	0
7	CS001212000031	296	0
8	CS001212000046	228	0
9	CS001212000070	456	0

	customer_id	address	prefecture_cd
0	CS021313000114	神奈川県伊勢原市粟窪**********	14
1	CS037613000071	東京都江東区南砂**********	13
2	CS031415000172	東京都渋谷区代々木**********	13
3	CS028811000001	神奈川県横浜市泉区和泉町**********	14
4	CS001215000145	東京都大田区仲六郷**********	13
5	CS020401000016	東京都板橋区若木**********	13
6	CS015414000103	東京都江東区北砂**********	13
7	CS029403000008	千葉県浦安市海楽**********	12
8	CS015804000004	東京都江東区北砂**********	13
9	CS033513000180	神奈川県横浜市旭区善部町**********	14

	customer_id	amount	quantile	pct_group
0	CS001113000004	1298	[548.5, 1478.0)	2
1	CS001114000005	626	[548.5, 1478.0)	2
2	CS001115000010	3044	[1478.0, 3651.0)	3
3	CS001205000004	1988	[1478.0, 3651.0)	3
4	CS001205000006	3337	[1478.0, 3651.0)	3
5	CS001211000025	456	[0.0, 548.5)	1
6	CS001212000027	448	[0.0, 548.5)	1
7	CS001212000031	296	[0.0, 548.5)	1
8	CS001212000046	228	[0.0, 548.5)	1
9	CS001212000070	456	[0.0, 548.5)	1

	customer_id	amount	quantile	pct_group
0	CS001113000004	1298	(548.5, 1478.0]	2
1	CS001114000005	626	(548.5, 1478.0]	2
2	CS001115000010	3044	(1478.0, 3651.0]	3
3	CS001205000004	1988	(1478.0, 3651.0]	3
4	CS001205000006	3337	(1478.0, 3651.0]	3
5	CS001211000025	456	(69.999, 548.5]	1
6	CS001212000027	448	(69.999, 548.5]	1
7	CS001212000031	296	(69.999, 548.5]	1
8	CS001212000046	228	(69.999, 548.5]	1
9	CS001212000070	456	(69.999, 548.5]	1