import pathlib
import pandas as pd
import numpy as np


# Absolute path to the sample time-series CSV read by the read_csv calls below.
p = pathlib.Path('/Users/mbp441/Desktop/github/PYTHON/jupyter_notebook/sample_time_series.csv')


# Load the CSV with default settings (no date parsing), then show the first
# rows, the column dtypes and the number of rows.
df = pd.read_csv(p)
display(df.head())
print(df.dtypes)
print(f'\n行数：{len(df)}')

Date     object
count     int64
dtype: object

行数：618


# Convert the 'Date' column to datetime64 while reading the CSV.
# df0 keeps every row of the file; only the preview is limited to the first
# five rows (calling .head() on the read_csv result would truncate df0 itself).
df0 = pd.read_csv(p, parse_dates=['Date'])
display(df0.head())
print(df0.dtypes)

Date     datetime64[ns]
count             int64
dtype: object


# Keep the raw 'Date' strings and add the parsed datetimes as a new
# 'to_datetime' column (recommended). assign() returns a new frame, so df
# itself is left unchanged.
df1 = df.assign(to_datetime=pd.to_datetime(df['Date']))
display(df1.head())
df1.dtypes

Date                   object
count                   int64
to_datetime    datetime64[ns]
dtype: object


# Year component of the first five timestamps.
df1['to_datetime'].head().dt.year

0    2020
1    2020
2    2020
3    2020
4    2020
Name: to_datetime, dtype: int64


# Month component of the first five timestamps.
df1['to_datetime'].head().dt.month

0    5
1    5
2    5
3    5
4    5
Name: to_datetime, dtype: int64


# Day of the week, once as a name and once as a number (Monday is 0).
# day_name() is a method and must be called with parentheses; dayofweek is a
# property and is accessed without them.
display(df1['to_datetime'].head().dt.day_name())
display(df1['to_datetime'].head().dt.dayofweek)

0     Saturday
1       Sunday
2       Monday
3      Tuesday
4    Wednesday
Name: to_datetime, dtype: object

0    5
1    6
2    0
3    1
4    2
Name: to_datetime, dtype: int64


# Quarter of the year for the first five timestamps.
df1['to_datetime'].head().dt.quarter

0    2
1    2
2    2
3    2
4    2
Name: to_datetime, dtype: int64


# Format the timestamps as 'YYYYMM' strings with dt.strftime.
# The result is only displayed here; assign it to a column to keep it.
df1['to_datetime'].head().dt.strftime('%Y%m')

0    202005
1    202005
2    202005
3    202005
4    202005
Name: to_datetime, dtype: object


# Rows whose timestamp falls on a Monday (dayofweek == 0).
df1.loc[df1['to_datetime'].dt.dayofweek.eq(0)].head()


# Rows whose timestamp falls in August (month == 8).
df1.loc[df1['to_datetime'].dt.month.eq(8)].head()


# Build df2: df1 plus a parsed copy of 'Date' that is moved into the index,
# giving the frame a DatetimeIndex named 'DateTimeIndex'.
df2 = df1.assign(DateTimeIndex=pd.to_datetime(df1['Date'])).set_index('DateTimeIndex')
display(df2.head())
df2.dtypes

Date                   object
count                   int64
to_datetime    datetime64[ns]
dtype: object


# Select the row for one specific date by its index label.
df2.loc['2020-05-12']

Date                     2020/5/12
count                          153
to_datetime    2020-05-12 00:00:00
Name: 2020-05-12 00:00:00, dtype: object


# How many rows fall in May 2020? (partial-string lookup on the DatetimeIndex)
df2.loc['2020-5'].shape
# The same lookup on df, df.loc['2020-5'], raises an error
# (presumably because df has no DatetimeIndex).

(23, 3)


# Slicing by a date range is also possible (the index is sorted first).
df2.sort_index().loc['2020-8-1':'2020-8-5']


# Extract August 2020 only, then show its first two and last two rows.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
month8 = df2.loc['2020-08':'2020-08']
pd.concat([month8.head(2), month8.tail(2)])


# Use size() to count the rows per month; sum() would total the values instead
# (see the next cells). The offset aliases are the same as for the aggregations below.
df2.resample('MS').size().head()

DateTimeIndex
2020-05-01    23
2020-06-01    30
2020-07-01    31
2020-08-01    31
2020-09-01    30
Freq: MS, dtype: int64


# Weekly totals (alias 'W': weekly bins that end on Sunday by default).
# numeric_only=True restricts the sum to numeric columns: df2 also holds the
# raw 'Date' strings and a datetime column, which pandas >= 2.0 no longer
# drops automatically and which cannot be meaningfully summed.
df_w = df2.resample('W').sum(numeric_only=True)
display(df_w.head())
print(type(df_w))

<class 'pandas.core.frame.DataFrame'>


# Monthly totals (aliases: 'MS' = month start, 'ME' = month end, formerly 'M').
# numeric_only=True keeps the string and datetime columns of df2 out of the sum.
df2.resample('MS').sum(numeric_only=True).head()


# Yearly totals (aliases: 'YS' = year start, 'YE' = year end; the older
# 'AS' / 'A' spellings are deprecated since pandas 2.2).
df2.resample('YS').sum(numeric_only=True).head()


# Quarterly totals (aliases: 'QS' = quarter start, 'QE' = quarter end, formerly 'Q').
df2.resample('QS').sum(numeric_only=True).head()


# df3 keeps df1's original index; the timestamps live in the 'to_datetime' column.
df3 = df1.copy()
df3.head()


# Resample on a column instead of the index via on=...; monthly ('MS') totals.
# numeric_only=True keeps the raw 'Date' strings out of the sum.
df3.resample('MS', on='to_datetime').sum(numeric_only=True).head()


# Same aggregation with groupby + pd.Grouper when a DatetimeIndex is set (df2).
df2.groupby(pd.Grouper(freq='MS')).sum(numeric_only=True).head()


# Without a DatetimeIndex (df3): key= names the datetime column to group on.
df3.groupby(pd.Grouper(key='to_datetime', freq='MS')).sum(numeric_only=True).head()

時系列データのスライスや集計方法

サンプル作成