import pathlib
import pandas as pd
import numpy as np
p = pathlib.Path('/Users/mbp441/Desktop/github/PYTHON/jupyter_notebook/sample_time_series.csv')
df = pd.read_csv(p)
display(df.head())
print(df.dtypes)
print(f'\n行数:{len(df)}')
| Date | count | |
|---|---|---|
| 0 | 2020/5/9 | 178 |
| 1 | 2020/5/10 | 162 |
| 2 | 2020/5/11 | 162 |
| 3 | 2020/5/12 | 153 |
| 4 | 2020/5/13 | 137 |
Date object count int64 dtype: object 行数:618
parse_dates=[-]でdatetime64型で取り込む¶# 読み込みと同時に変換する
df0 = pd.read_csv(p, parse_dates=['Date']).head()
display(df0.head())
print(df0.dtypes)
| Date | count | |
|---|---|---|
| 0 | 2020-05-09 | 178 |
| 1 | 2020-05-10 | 162 |
| 2 | 2020-05-11 | 162 |
| 3 | 2020-05-12 | 153 |
| 4 | 2020-05-13 | 137 |
Date datetime64[ns] count int64 dtype: object
to_datetime でdatetime64[ns]型へ変換¶# 新しく列を作成(推奨)
df1 = df.copy()
df1['to_datetime'] = pd.to_datetime(df1['Date'])
display(df1.head())
df1.dtypes
| Date | count | to_datetime | |
|---|---|---|---|
| 0 | 2020/5/9 | 178 | 2020-05-09 |
| 1 | 2020/5/10 | 162 | 2020-05-10 |
| 2 | 2020/5/11 | 162 | 2020-05-11 |
| 3 | 2020/5/12 | 153 | 2020-05-12 |
| 4 | 2020/5/13 | 137 | 2020-05-13 |
Date object count int64 to_datetime datetime64[ns] dtype: object
df1['to_datetime'].dt.year.head()
0 2020 1 2020 2 2020 3 2020 4 2020 Name: to_datetime, dtype: int64
df1['to_datetime'].dt.month.head()
0 5 1 5 2 5 3 5 4 5 Name: to_datetime, dtype: int64
# 曜日を取得する。day_name()、dayofweekは、括弧は不要
display(
df1['to_datetime'].dt.day_name().head(),
df1['to_datetime'].dt.dayofweek.head()
)
0 Saturday 1 Sunday 2 Monday 3 Tuesday 4 Wednesday Name: to_datetime, dtype: object
0 5 1 6 2 0 3 1 4 2 Name: to_datetime, dtype: int64
# 四半期
df1['to_datetime'].dt.quarter.head()
0 2 1 2 2 2 3 2 4 2 Name: to_datetime, dtype: int64
# dt.strftimeでフォーマットを指定する(新規で列追加)
df1['to_datetime'].dt.strftime('%Y%m').head()
0 202005 1 202005 2 202005 3 202005 4 202005 Name: to_datetime, dtype: object
df1[df1['to_datetime'].dt.dayofweek == 0].head()
| Date | count | to_datetime | |
|---|---|---|---|
| 2 | 2020/5/11 | 162 | 2020-05-11 |
| 9 | 2020/5/18 | 83 | 2020-05-18 |
| 16 | 2020/5/25 | 45 | 2020-05-25 |
| 23 | 2020/6/1 | 107 | 2020-06-01 |
| 30 | 2020/6/8 | 133 | 2020-06-08 |
df1[df1['to_datetime'].dt.month == 8].head()
| Date | count | to_datetime | |
|---|---|---|---|
| 84 | 2020/8/1 | 791 | 2020-08-01 |
| 85 | 2020/8/2 | 878 | 2020-08-02 |
| 86 | 2020/8/3 | 993 | 2020-08-03 |
| 87 | 2020/8/4 | 1033 | 2020-08-04 |
| 88 | 2020/8/5 | 1014 | 2020-08-05 |
datetime64[ns]型)¶df2 = df1.copy()
df2['DateTimeIndex'] = pd.to_datetime(df2['Date'])
df2.set_index('DateTimeIndex', inplace=True)
display(df2.head())
df2.dtypes
| Date | count | to_datetime | |
|---|---|---|---|
| DateTimeIndex | |||
| 2020-05-09 | 2020/5/9 | 178 | 2020-05-09 |
| 2020-05-10 | 2020/5/10 | 162 | 2020-05-10 |
| 2020-05-11 | 2020/5/11 | 162 | 2020-05-11 |
| 2020-05-12 | 2020/5/12 | 153 | 2020-05-12 |
| 2020-05-13 | 2020/5/13 | 137 | 2020-05-13 |
Date object count int64 to_datetime datetime64[ns] dtype: object
# 指定してスライス
df2.loc['2020-05-12']
Date 2020/5/12 count 153 to_datetime 2020-05-12 00:00:00 Name: 2020-05-12 00:00:00, dtype: object
# 2020−5月の行数は?
df2.loc['2020-5'].shape
# df.loc['2020-5']だけだとエラーになる
(23, 3)
# 範囲をしてスライスが可能
df2.sort_index().loc['2020-8-1':'2020-8-5']
| Date | count | to_datetime | |
|---|---|---|---|
| DateTimeIndex | |||
| 2020-08-01 | 2020/8/1 | 791 | 2020-08-01 |
| 2020-08-02 | 2020/8/2 | 878 | 2020-08-02 |
| 2020-08-03 | 2020/8/3 | 993 | 2020-08-03 |
| 2020-08-04 | 2020/8/4 | 1033 | 2020-08-04 |
| 2020-08-05 | 2020/8/5 | 1014 | 2020-08-05 |
# 8月だけも抽出可能
month8 = df2.loc['2020-08':'2020-08']
month8.head(2).append(month8.tail(2))
| Date | count | to_datetime | |
|---|---|---|---|
| DateTimeIndex | |||
| 2020-08-01 | 2020/8/1 | 791 | 2020-08-01 |
| 2020-08-02 | 2020/8/2 | 878 | 2020-08-02 |
| 2020-08-30 | 2020/8/30 | 870 | 2020-08-30 |
| 2020-08-31 | 2020/8/31 | 904 | 2020-08-31 |
df.resample(offset)¶# カウントする場合は size() を利用。sum()だと集計になる(次)、オフセットは以下の集計と同様
df2.resample('MS').size().head()
DateTimeIndex 2020-05-01 23 2020-06-01 30 2020-07-01 31 2020-08-01 31 2020-09-01 30 Freq: MS, dtype: int64
# 週ごとにカウント(オフセットは W:デフォルトの週末は日曜)
df_w = df2.resample('W').sum()
display(df_w.head())
print(type(df_w))
| count | |
|---|---|
| DateTimeIndex | |
| 2020-05-10 | 340 |
| 2020-05-17 | 866 |
| 2020-05-24 | 418 |
| 2020-05-31 | 426 |
| 2020-06-07 | 872 |
<class 'pandas.core.frame.DataFrame'>
# 月ごとにカウント(オフセットは M or MS)
df2.resample('MS').sum().head()
| count | |
|---|---|
| DateTimeIndex | |
| 2020-05-01 | 2050 |
| 2020-06-01 | 2059 |
| 2020-07-01 | 5490 |
| 2020-08-01 | 31258 |
| 2020-09-01 | 12246 |
# 年ごとにカウント(オフセットは A or AS)
df2.resample('AS').sum().head()
| count | |
|---|---|
| DateTimeIndex | |
| 2020-01-01 | 84467 |
| 2021-01-01 | 832241 |
| 2022-01-01 | 14380 |
# Qごとにカウント(オフセットは Q or QS)
df2.resample('QS').sum().head()
| count | |
|---|---|
| DateTimeIndex | |
| 2020-04-01 | 4109 |
| 2020-07-01 | 48994 |
| 2020-10-01 | 31364 |
| 2021-01-01 | 139712 |
| 2021-04-01 | 222485 |
df3 = df1.copy()
df3.head()
| Date | count | to_datetime | |
|---|---|---|---|
| 0 | 2020/5/9 | 178 | 2020-05-09 |
| 1 | 2020/5/10 | 162 | 2020-05-10 |
| 2 | 2020/5/11 | 162 | 2020-05-11 |
| 3 | 2020/5/12 | 153 | 2020-05-12 |
| 4 | 2020/5/13 | 137 | 2020-05-13 |
# 参考までにMSで集計する。オフセットは前述のとおり。
df3.resample('MS', on='to_datetime').sum().head()
| count | |
|---|---|
| to_datetime | |
| 2020-05-01 | 2050 |
| 2020-06-01 | 2059 |
| 2020-07-01 | 5490 |
| 2020-08-01 | 31258 |
| 2020-09-01 | 12246 |
df.groupby(pd.Grouper(-))を利用する場合¶# DateTimeIndexがセットされている場合(df2を利用)
df2.groupby(pd.Grouper(freq='MS')).sum().head()
| count | |
|---|---|
| DateTimeIndex | |
| 2020-05-01 | 2050 |
| 2020-06-01 | 2059 |
| 2020-07-01 | 5490 |
| 2020-08-01 | 31258 |
| 2020-09-01 | 12246 |
# DateTimeIndexを使用しない場合(df3を利用)('to_datetime'はここでは列名を指す)
df3.groupby(pd.Grouper(key='to_datetime', freq='MS')).sum().head()
| count | |
|---|---|
| to_datetime | |
| 2020-05-01 | 2050 |
| 2020-06-01 | 2059 |
| 2020-07-01 | 5490 |
| 2020-08-01 | 31258 |
| 2020-09-01 | 12246 |