import pandas as pd
import pathlib
import numpy as np
p = pathlib.Path('/Users/mbp441/Desktop/github/PYTHON/jupyter_notebook/cookbook_data')
date, time, datetime の3つのデータ型を提供date :年月日time :時分秒マイクロ秒(日付ではない)、例)12時間30分datetime :日付と時間を合わせたものTimestampオブジェクト :精度はナノ秒、Numpyのdatetimeデータ型を継承timedeltaオブジェクトがあり、日付時間の加減計算で利用される(1)まずdatetimeモジュールをインポートして、date,time,datetimeオブジェクトをつくる
import datetime
date = datetime.date(year=2022, month=1, day=6)
time = datetime.time(hour=7, minute=0, second=0, microsecond=463198)
dt = datetime.datetime(year=2022, month=1, day=6,
hour=7, minute=0, second=0, microsecond=463198)
print('date is', date)
print('time is', time)
print('datetime is', dt)
date is 2022-01-06 time is 07:00:00.463198 datetime is 2022-01-06 07:00:00.463198
(2)datetimeモジュールの別のデータ型、timedeltaオブジェクトを作って出力する
td = datetime.timedelta(weeks=2, days=5, hours=10,
minutes=20, seconds=6.73,
milliseconds=99, microseconds=8)
print(td)
19 days, 10:20:06.829008
(3)手順(1)のdate、datetimeオブジェクトに、手順2のtimedeltaを加える
print('new date is', date + td)
print('new datetime is', dt + td)
new date is 2022-01-25 new datetime is 2022-01-25 17:20:07.292206
timedeltaを加算すると、dateは日付、datetimeは日付日時詳細まで表示(4)timeオブジェクトにtimedeltaを加算することはできない
time + td
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-23-1bc8485497ad> in <module> ----> 1 time + td TypeError: unsupported operand type(s) for +: 'datetime.time' and 'datetime.timedelta'
(5)pandasのナノ秒精度のTimestampオブジェクトは柔軟で広範囲の入力に対応
pd.Timestamp(year=2022, month=1, day=6, hour=7, minute=15, second=8, microsecond=99)
# >>> Timestamp('2022-01-06 07:15:08.000099')
pd.Timestamp('2022/1/6')
# >>> Timestamp('2022-01-06 00:00:00')
pd.Timestamp('2022-1/6')
# >>> Timestamp('2022-01-06 00:00:00')
pd.Timestamp('Jan 6, 2022, 7:5')
# >>> Timestamp('2022-01-06 07:05:00')
Timestamp('2022-01-06 07:05:00')
(6)
display(pd.Timestamp(300))
display(pd.Timestamp(365*3, unit='D'))
Timestamp('1970-01-01 00:00:00.000000300')
Timestamp('1972-12-31 00:00:00')
(7)
to_datetime関数があるTimestampコンストラクタと同じ働きをするが、特殊な状況を扱うパラメータが少し異なるpd.to_datetime('2022-1-6')
# >>> Timestamp('2022-01-06 00:00:00')
pd.to_datetime('2022-1-6', dayfirst=True)
# >>> Timestamp('2022-01-06 00:00:00')
# 文字列がpandasで自動認識できない特別な日付書式のときに以下が役立つ(書式は下記リンク参照)
pd.to_datetime('Start Date: Jan 22, 2022, Start Time: 1:30 pm',
format='Start Date: %b %d, %Y, Start Time: %I:%M %p')
# >>> Timestamp('2022-01-22 13:30:00')
pd.to_datetime(365/2, unit='D', origin='2022/1/1')
# >>> Timestamp('2022-07-02 12:00:00')
Timestamp('2022-07-02 12:00:00')
(8)
to_datetime関数にはさらに別の機能がある。文字列・整数のリスト・Series全体をTimestampに変換するto_datetimeを使うことが多いs = pd.Series([10, 100, 1000, 10000, 20000])
pd.to_datetime(s, unit='D', origin='1972/1/22')
0 1972-02-01 1 1972-05-01 2 1974-10-18 3 1999-06-09 4 2026-10-25 dtype: datetime64[ns]
s = pd.Series(['12-5-2022', '11-12-2022', '1/2/2022', '40/2/2022'])
# dayfirst:日付の解析順序、他にもあるので以下URL参照。coerceがないとエラーになる(NaT:Not a Time)
pd.to_datetime(s, dayfirst=True, errors='coerce')
0 2022-05-12 1 2022-12-11 2 2022-02-01 3 NaT dtype: datetime64[ns]
# これは何を表現しているのか?
pd.to_datetime(['Aug 3 2022 3:45:56', '2025/1-31'])
DatetimeIndex(['2022-08-03 03:45:56', '2025-01-31 00:00:00'], dtype='datetime64[ns]', freq=None)
(9)
to_datetime関数と同様に、pandasには時間量を表すTimedeltaとto_timedeltaがあるto_timedelta関数は、1つのTimedeltaオブジェクトを作るto_datetime同様、to_timedeltaのほうが機能が多く、リストやSeries全体をオブジェクトに変換するpd.Timedelta('12 days 5 hours 3 minutes 123456789 nanoseconds')
Timedelta('12 days 05:03:00.123456789')
pd.Timedelta(days=5, minutes=7.34)
Timedelta('5 days 00:07:20.400000')
pd.Timedelta(100, unit='W')
Timedelta('700 days 00:00:00')
pd.Timedelta('67:15:45.454')
Timedelta('2 days 19:15:45.454000')
s = pd.Series([10, 100])
pd.to_timedelta(s, unit='s')
0 0 days 00:00:10 1 0 days 00:01:40 dtype: timedelta64[ns]
time_strings = ['2 days 24 minutes 89.67 seconds', '00:45:23.6']
pd.to_timedelta(time_strings)
TimedeltaIndex(['2 days 00:25:29.670000', '0 days 00:45:23.600000'], dtype='timedelta64[ns]', freq=None)
(10)
pd.Timedelta('12 days 5 hours 3 minutes') * 2
Timedelta('24 days 10:06:00')
pd.Timestamp('1/1/2022') + pd.Timedelta('12 days 5 hours 3 minutes') * 2
Timestamp('2022-01-25 10:06:00')
td1 = pd.to_timedelta([10, 100], unit='s')
td2 = pd.to_timedelta(['3 hours', '4 hours'])
print(f'td1: {td1}')
print(f'td2: {td2}')
print(f'td1+td2: {td1 + td2}')
td1: TimedeltaIndex(['0 days 00:00:10', '0 days 00:01:40'], dtype='timedelta64[ns]', freq=None) td2: TimedeltaIndex(['0 days 03:00:00', '0 days 04:00:00'], dtype='timedelta64[ns]', freq=None) td1+td2: TimedeltaIndex(['0 days 03:00:10', '0 days 04:01:40'], dtype='timedelta64[ns]', freq=None)
pd.Timedelta('12 days') / pd.Timedelta('3 days')
4.0
(11)
ts = pd.Timestamp('2022-1-6 22:27:30')
ts.ceil('h')
Timestamp('2022-01-06 23:00:00')
ts.year, ts.month, ts.hour, ts.minute, ts.second
(2022, 1, 22, 27, 30)
ts.dayofweek
3
ts.dayofyear
6
ts.daysinmonth
31
ts.to_pydatetime()
datetime.datetime(2022, 1, 6, 22, 27, 30)
td = pd.Timedelta(125.8723, unit='h')
td
Timedelta('5 days 05:52:20.280000')
td.round('min')
Timedelta('5 days 05:52:00')
td.components
Components(days=5, hours=5, minutes=52, seconds=20, milliseconds=280, microseconds=0, nanoseconds=0)
td.total_seconds()
453140.28
(1)
HDF5形式のcrime.h5ファイルから読み込みcrime = pd.read_hdf(p / 'crime.h5', 'crime')
crime.dtypes
OFFENSE_TYPE_ID category OFFENSE_CATEGORY_ID category REPORTED_DATE datetime64[ns] GEO_LON float64 GEO_LAT float64 NEIGHBORHOOD_ID category IS_CRIME int64 IS_TRAFFIC int64 dtype: object
(2)
crime = crime.set_index('REPORTED_DATE')
crime
| OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC | |
|---|---|---|---|---|---|---|---|
| REPORTED_DATE | |||||||
| 2014-06-29 02:01:00 | traffic-accident-dui-duid | traffic-accident | -105.000149 | 39.745753 | cbd | 0 | 1 |
| 2014-06-29 01:54:00 | vehicular-eluding-no-chase | all-other-crimes | -104.884660 | 39.738702 | east-colfax | 1 | 0 |
| 2014-06-29 02:00:00 | disturbing-the-peace | public-disorder | -105.020719 | 39.706674 | athmar-park | 1 | 0 |
| 2014-06-29 02:18:00 | curfew | public-disorder | -105.001552 | 39.769505 | sunnyside | 1 | 0 |
| 2014-06-29 04:17:00 | aggravated-assault | aggravated-assault | -105.018557 | 39.679229 | college-view-south-platte | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-09-13 05:48:00 | burglary-business-by-force | burglary | -105.033840 | 39.762365 | west-highland | 1 | 0 |
| 2017-09-12 20:37:00 | weapon-unlawful-discharge-of | all-other-crimes | -105.040313 | 39.721264 | barnum-west | 1 | 0 |
| 2017-09-12 16:32:00 | traf-habitual-offender | all-other-crimes | -104.847024 | 39.779596 | montbello | 1 | 0 |
| 2017-09-12 13:04:00 | criminal-mischief-other | public-disorder | -104.949183 | 39.756353 | skyland | 1 | 0 |
| 2017-09-12 09:30:00 | theft-other | larceny | -104.985739 | 39.735045 | capitol-hill | 1 | 0 |
460911 rows × 7 columns
(3)
crime.loc['2016-05-12 16:45:00']
| OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC | |
|---|---|---|---|---|---|---|---|
| REPORTED_DATE | |||||||
| 2016-05-12 16:45:00 | traffic-accident | traffic-accident | -104.847024 | 39.779596 | montbello | 0 | 1 |
| 2016-05-12 16:45:00 | traffic-accident | traffic-accident | -105.049180 | 39.769296 | west-highland | 0 | 1 |
| 2016-05-12 16:45:00 | fraud-identity-theft | white-collar-crime | -104.931971 | 39.717359 | hilltop | 1 | 0 |
(4)
Timestampなら、部分マッチングする行をすべて選ぶことができるcrime.loc['2016-05-12']
| OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC | |
|---|---|---|---|---|---|---|---|
| REPORTED_DATE | |||||||
| 2016-05-12 23:51:00 | criminal-mischief-other | public-disorder | -105.017241 | 39.705845 | athmar-park | 1 | 0 |
| 2016-05-12 18:40:00 | liquor-possession | drug-alcohol | -104.995692 | 39.747875 | cbd | 1 | 0 |
| 2016-05-12 22:26:00 | traffic-accident | traffic-accident | -104.880037 | 39.777037 | stapleton | 0 | 1 |
| 2016-05-12 20:35:00 | theft-bicycle | larceny | -104.929350 | 39.763797 | northeast-park-hill | 1 | 0 |
| 2016-05-12 09:39:00 | theft-of-motor-vehicle | auto-theft | -104.941233 | 39.775510 | elyria-swansea | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2016-05-12 17:55:00 | public-peace-other | public-disorder | -105.027747 | 39.700029 | westwood | 1 | 0 |
| 2016-05-12 19:24:00 | threats-to-injure | public-disorder | -104.947118 | 39.763777 | clayton | 1 | 0 |
| 2016-05-12 22:28:00 | sex-aslt-rape | sexual-assault | NaN | NaN | harvey-park-south | 1 | 0 |
| 2016-05-12 15:59:00 | menacing-felony-w-weap | aggravated-assault | -104.935172 | 39.723703 | hilltop | 1 | 0 |
| 2016-05-12 16:39:00 | assault-dv | other-crimes-against-persons | -104.974700 | 39.740555 | north-capitol-hill | 1 | 0 |
243 rows × 7 columns
(5)1つの日付だけでなく、ある月、年、時間で選ぶことができる
crime.loc['2016-5'].shape
(8012, 7)
crime.loc['2016'].shape
(91076, 7)
crime.loc['2016-5-12 03']
| OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC | |
|---|---|---|---|---|---|---|---|
| REPORTED_DATE | |||||||
| 2016-05-12 03:11:00 | theft-shoplift | larceny | -105.020586 | 39.678224 | college-view-south-platte | 1 | 0 |
| 2016-05-12 03:43:00 | traf-other | all-other-crimes | -104.892862 | 39.740153 | east-colfax | 1 | 0 |
| 2016-05-12 03:18:00 | criminal-trespassing | all-other-crimes | -104.990480 | 39.728076 | lincoln-park | 1 | 0 |
| 2016-05-12 03:13:00 | assault-simple | other-crimes-against-persons | -104.977202 | 39.739745 | capitol-hill | 1 | 0 |
(6)選択文字列は、月の英字名でもよい
crime.loc['Dec 2015'].sort_index()
| OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC | |
|---|---|---|---|---|---|---|---|
| REPORTED_DATE | |||||||
| 2015-12-01 00:48:00 | drug-cocaine-possess | drug-alcohol | -104.891681 | 39.740155 | east-colfax | 1 | 0 |
| 2015-12-01 00:48:00 | theft-of-motor-vehicle | auto-theft | -104.891681 | 39.740155 | east-colfax | 1 | 0 |
| 2015-12-01 01:00:00 | criminal-mischief-other | public-disorder | -104.993574 | 39.739773 | civic-center | 1 | 0 |
| 2015-12-01 01:10:00 | traf-other | all-other-crimes | -104.995496 | 39.725618 | baker | 1 | 0 |
| 2015-12-01 01:10:00 | traf-habitual-offender | all-other-crimes | -104.995496 | 39.725618 | baker | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2015-12-31 23:35:00 | drug-cocaine-possess | drug-alcohol | -104.988019 | 39.753420 | five-points | 1 | 0 |
| 2015-12-31 23:40:00 | traffic-accident | traffic-accident | -104.945076 | 39.736768 | congress-park | 0 | 1 |
| 2015-12-31 23:44:00 | drug-cocaine-possess | drug-alcohol | -104.966814 | 39.746460 | city-park-west | 1 | 0 |
| 2015-12-31 23:45:00 | violation-of-restraining-order | all-other-crimes | -105.034887 | 39.741827 | west-colfax | 1 | 0 |
| 2015-12-31 23:50:00 | weapon-poss-illegal-dangerous | all-other-crimes | -105.032769 | 39.709188 | westwood | 1 | 0 |
6907 rows × 7 columns
(7)月の形式にはさまざまなパターンがある
crime.loc['2016 Sep, 15'].shape
(252, 7)
crime.loc['21st October 2014 05'].shape
(4, 7)
(8)スライス表記を使って、データの範囲を指定する
crime.sort_index().loc['2015-3-4':'2016-3-5']
| OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC | |
|---|---|---|---|---|---|---|---|
| REPORTED_DATE | |||||||
| 2015-03-04 00:11:00 | assault-dv | other-crimes-against-persons | -105.021966 | 39.770883 | sunnyside | 1 | 0 |
| 2015-03-04 00:19:00 | assault-dv | other-crimes-against-persons | -104.978988 | 39.748799 | five-points | 1 | 0 |
| 2015-03-04 00:27:00 | theft-of-services | larceny | -105.055082 | 39.790564 | regis | 1 | 0 |
| 2015-03-04 00:49:00 | traffic-accident-hit-and-run | traffic-accident | -104.987454 | 39.701378 | washington-park-west | 0 | 1 |
| 2015-03-04 01:07:00 | burglary-business-no-force | burglary | -105.010843 | 39.762538 | highland | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2016-03-05 23:39:00 | traf-other | all-other-crimes | -104.890249 | 39.738849 | east-colfax | 1 | 0 |
| 2016-03-05 23:40:00 | public-order-crimes-other | all-other-crimes | -104.995367 | 39.741134 | cbd | 1 | 0 |
| 2016-03-05 23:51:00 | robbery-street | robbery | -105.033273 | 39.740214 | west-colfax | 1 | 0 |
| 2016-03-05 23:52:00 | traf-other | all-other-crimes | -105.053282 | 39.718429 | barnum-west | 1 | 0 |
| 2016-03-05 23:52:00 | traffic-accident-dui-duid | traffic-accident | -105.053282 | 39.718429 | barnum-west | 0 | 1 |
91114 rows × 7 columns
(9)
.locインデクサを使えば、すべての結果がこうなるcrime.sort_index().loc['2015-3-4 22':'2016-1-1 11:45:00']
| OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC | |
|---|---|---|---|---|---|---|---|
| REPORTED_DATE | |||||||
| 2015-03-04 22:25:00 | traffic-accident-hit-and-run | traffic-accident | -104.973896 | 39.769064 | five-points | 0 | 1 |
| 2015-03-04 22:30:00 | traffic-accident | traffic-accident | -104.906412 | 39.632816 | hampden-south | 0 | 1 |
| 2015-03-04 22:32:00 | traffic-accident-hit-and-run | traffic-accident | -104.979180 | 39.706613 | washington-park-west | 0 | 1 |
| 2015-03-04 22:33:00 | traffic-accident-hit-and-run | traffic-accident | -104.991655 | 39.740067 | civic-center | 0 | 1 |
| 2015-03-04 22:36:00 | theft-unauth-use-of-ftd | white-collar-crime | -105.045235 | 39.667928 | harvey-park | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2016-01-01 11:10:00 | theft-of-motor-vehicle | auto-theft | -104.819450 | 39.791615 | montbello | 1 | 0 |
| 2016-01-01 11:11:00 | traffic-accident | traffic-accident | -104.981701 | 39.736814 | capitol-hill | 0 | 1 |
| 2016-01-01 11:11:00 | traffic-accident-hit-and-run | traffic-accident | -104.973731 | 39.714223 | speer | 0 | 1 |
| 2016-01-01 11:16:00 | traf-other | all-other-crimes | -104.977509 | 39.783541 | globeville | 1 | 0 |
| 2016-01-01 11:22:00 | traffic-accident | traffic-accident | -105.053250 | 39.731282 | villa-park | 0 | 1 |
75071 rows × 7 columns
DatetimeIndex型であれば、このレシピの全メソッドが使えるDatetimeIndexだけで働くDataFrame/Seriesメソッドが多数ある(1)REPORTED_DATEカラムをインデックスにしてDatetimeIndexになっていることを確認する
crime = pd.read_hdf(p / 'crime.h5', 'crime').set_index('REPORTED_DATE')
print(type(crime.index))
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
(2)between_timeを使い、午前2時から5時までに発生した全犯罪を日付と関係なく抽出
crime.between_time('2:00', '5:00', include_end=False)
| OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC | |
|---|---|---|---|---|---|---|---|
| REPORTED_DATE | |||||||
| 2014-06-29 02:01:00 | traffic-accident-dui-duid | traffic-accident | -105.000149 | 39.745753 | cbd | 0 | 1 |
| 2014-06-29 02:00:00 | disturbing-the-peace | public-disorder | -105.020719 | 39.706674 | athmar-park | 1 | 0 |
| 2014-06-29 02:18:00 | curfew | public-disorder | -105.001552 | 39.769505 | sunnyside | 1 | 0 |
| 2014-06-29 04:17:00 | aggravated-assault | aggravated-assault | -105.018557 | 39.679229 | college-view-south-platte | 1 | 0 |
| 2014-06-29 04:22:00 | violation-of-restraining-order | all-other-crimes | -104.972447 | 39.739449 | cheesman-park | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-08-25 04:41:00 | theft-items-from-vehicle | theft-from-motor-vehicle | -104.880586 | 39.645164 | hampden-south | 1 | 0 |
| 2017-09-13 04:17:00 | theft-of-motor-vehicle | auto-theft | -105.028694 | 39.708288 | westwood | 1 | 0 |
| 2017-09-13 02:21:00 | assault-simple | other-crimes-against-persons | -104.925733 | 39.654184 | university-hills | 1 | 0 |
| 2017-09-13 03:21:00 | traffic-accident-dui-duid | traffic-accident | -105.010711 | 39.757385 | highland | 0 | 1 |
| 2017-09-13 02:15:00 | traffic-accident-hit-and-run | traffic-accident | -105.043950 | 39.787436 | regis | 0 | 1 |
29078 rows × 7 columns
cf.上記をdatetimeモジュールのtimeオブジェクトで表現した場合
import datetime
crime.between_time(datetime.time(2,0), datetime.time(5,0), include_end=False)
| OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC | |
|---|---|---|---|---|---|---|---|
| REPORTED_DATE | |||||||
| 2014-06-29 02:01:00 | traffic-accident-dui-duid | traffic-accident | -105.000149 | 39.745753 | cbd | 0 | 1 |
| 2014-06-29 02:00:00 | disturbing-the-peace | public-disorder | -105.020719 | 39.706674 | athmar-park | 1 | 0 |
| 2014-06-29 02:18:00 | curfew | public-disorder | -105.001552 | 39.769505 | sunnyside | 1 | 0 |
| 2014-06-29 04:17:00 | aggravated-assault | aggravated-assault | -105.018557 | 39.679229 | college-view-south-platte | 1 | 0 |
| 2014-06-29 04:22:00 | violation-of-restraining-order | all-other-crimes | -104.972447 | 39.739449 | cheesman-park | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-08-25 04:41:00 | theft-items-from-vehicle | theft-from-motor-vehicle | -104.880586 | 39.645164 | hampden-south | 1 | 0 |
| 2017-09-13 04:17:00 | theft-of-motor-vehicle | auto-theft | -105.028694 | 39.708288 | westwood | 1 | 0 |
| 2017-09-13 02:21:00 | assault-simple | other-crimes-against-persons | -104.925733 | 39.654184 | university-hills | 1 | 0 |
| 2017-09-13 03:21:00 | traffic-accident-dui-duid | traffic-accident | -105.010711 | 39.757385 | highland | 0 | 1 |
| 2017-09-13 02:15:00 | traffic-accident-hit-and-run | traffic-accident | -105.043950 | 39.787436 | regis | 0 | 1 |
29078 rows × 7 columns
(3)at_timeで指定した時刻の日付をすべて取得
crime.at_time('5:47').head()
| OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC | |
|---|---|---|---|---|---|---|---|
| REPORTED_DATE | |||||||
| 2013-11-26 05:47:00 | criminal-mischief-other | public-disorder | -104.991476 | 39.751536 | cbd | 1 | 0 |
| 2017-04-09 05:47:00 | criminal-mischief-mtr-veh | public-disorder | -104.959394 | 39.678425 | university | 1 | 0 |
| 2017-02-19 05:47:00 | criminal-mischief-other | public-disorder | -104.986767 | 39.741336 | north-capitol-hill | 1 | 0 |
| 2017-02-16 05:47:00 | aggravated-assault | aggravated-assault | -104.934029 | 39.732320 | hale | 1 | 0 |
| 2017-02-12 05:47:00 | police-interference | all-other-crimes | -104.976306 | 39.722644 | speer | 1 | 0 |
(4)
firstメソッドを使うpd.offsetsもジュールにあるcrime_sort = crime.sort_index()
crime_sort.first(pd.offsets.MonthBegin(6))
| OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC | |
|---|---|---|---|---|---|---|---|
| REPORTED_DATE | |||||||
| 2012-01-02 00:06:00 | aggravated-assault | aggravated-assault | -104.816860 | 39.796717 | montbello | 1 | 0 |
| 2012-01-02 00:06:00 | violation-of-restraining-order | all-other-crimes | -104.816860 | 39.796717 | montbello | 1 | 0 |
| 2012-01-02 00:16:00 | traffic-accident-dui-duid | traffic-accident | -104.971851 | 39.736874 | cheesman-park | 0 | 1 |
| 2012-01-02 00:47:00 | traffic-accident | traffic-accident | -104.951824 | 39.740168 | congress-park | 0 | 1 |
| 2012-01-02 01:35:00 | aggravated-assault | aggravated-assault | -104.998131 | 39.749922 | union-station | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2012-06-30 23:40:00 | traffic-accident-dui-duid | traffic-accident | -104.922266 | 39.678471 | goldsmith | 0 | 1 |
| 2012-06-30 23:44:00 | traffic-accident | traffic-accident | -104.987578 | 39.711158 | baker | 0 | 1 |
| 2012-06-30 23:50:00 | criminal-mischief-mtr-veh | public-disorder | -104.838271 | 39.788683 | montbello | 1 | 0 |
| 2012-06-30 23:54:00 | traffic-accident-hit-and-run | traffic-accident | -105.014162 | 39.740439 | lincoln-park | 0 | 1 |
| 2012-07-01 00:01:00 | robbery-street | robbery | -104.924292 | 39.767585 | northeast-park-hill | 1 | 0 |
27489 rows × 7 columns
cf) 以下でわかるように起点の「first_date」は、日時分から開始されている
first_date = crime_sort.index[0]
first_date
Timestamp('2012-01-02 00:06:00')
first_date + pd.offsets.MonthBegin(6)
Timestamp('2012-07-01 00:06:00')
first_date + pd.offsets.MonthEnd(6)
Timestamp('2012-06-30 00:06:00')
(5)
MonthEndを使ってみるcrime_sort.first(pd.offsets.MonthEnd(6))
| OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC | |
|---|---|---|---|---|---|---|---|
| REPORTED_DATE | |||||||
| 2012-01-02 00:06:00 | aggravated-assault | aggravated-assault | -104.816860 | 39.796717 | montbello | 1 | 0 |
| 2012-01-02 00:06:00 | violation-of-restraining-order | all-other-crimes | -104.816860 | 39.796717 | montbello | 1 | 0 |
| 2012-01-02 00:16:00 | traffic-accident-dui-duid | traffic-accident | -104.971851 | 39.736874 | cheesman-park | 0 | 1 |
| 2012-01-02 00:47:00 | traffic-accident | traffic-accident | -104.951824 | 39.740168 | congress-park | 0 | 1 |
| 2012-01-02 01:35:00 | aggravated-assault | aggravated-assault | -104.998131 | 39.749922 | union-station | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2012-06-29 23:01:00 | aggravated-assault | aggravated-assault | -105.019673 | 39.733250 | sun-valley | 1 | 0 |
| 2012-06-29 23:11:00 | traffic-accident | traffic-accident | -104.918451 | 39.653080 | southmoor-park | 0 | 1 |
| 2012-06-29 23:41:00 | robbery-street | robbery | -104.991912 | 39.756163 | five-points | 1 | 0 |
| 2012-06-29 23:57:00 | assault-simple | other-crimes-against-persons | -104.987360 | 39.715162 | speer | 1 | 0 |
| 2012-06-30 00:04:00 | traffic-accident | traffic-accident | -104.894697 | 39.628902 | hampden-south | 0 | 1 |
27332 rows × 7 columns
(6)
normalizeパラメータを使うと、時間要素がゼロになり、2012-01-crime_sort.first(pd.offsets.MonthBegin(6, normalize=True))
| OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC | |
|---|---|---|---|---|---|---|---|
| REPORTED_DATE | |||||||
| 2012-01-02 00:06:00 | aggravated-assault | aggravated-assault | -104.816860 | 39.796717 | montbello | 1 | 0 |
| 2012-01-02 00:06:00 | violation-of-restraining-order | all-other-crimes | -104.816860 | 39.796717 | montbello | 1 | 0 |
| 2012-01-02 00:16:00 | traffic-accident-dui-duid | traffic-accident | -104.971851 | 39.736874 | cheesman-park | 0 | 1 |
| 2012-01-02 00:47:00 | traffic-accident | traffic-accident | -104.951824 | 39.740168 | congress-park | 0 | 1 |
| 2012-01-02 01:35:00 | aggravated-assault | aggravated-assault | -104.998131 | 39.749922 | union-station | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2012-06-30 23:40:00 | traffic-accident-hit-and-run | traffic-accident | -104.922266 | 39.678471 | goldsmith | 0 | 1 |
| 2012-06-30 23:40:00 | traffic-accident-dui-duid | traffic-accident | -104.922266 | 39.678471 | goldsmith | 0 | 1 |
| 2012-06-30 23:44:00 | traffic-accident | traffic-accident | -104.987578 | 39.711158 | baker | 0 | 1 |
| 2012-06-30 23:50:00 | criminal-mischief-mtr-veh | public-disorder | -104.838271 | 39.788683 | montbello | 1 | 0 |
| 2012-06-30 23:54:00 | traffic-accident-hit-and-run | traffic-accident | -105.014162 | 39.740439 | lincoln-park | 0 | 1 |
27488 rows × 7 columns
(7) (6)をもっと簡単な方法で、最初の6ヶ月分のデータをスライシングできる
crime_sort.loc[:'2012-06']
| OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC | |
|---|---|---|---|---|---|---|---|
| REPORTED_DATE | |||||||
| 2012-01-02 00:06:00 | aggravated-assault | aggravated-assault | -104.816860 | 39.796717 | montbello | 1 | 0 |
| 2012-01-02 00:06:00 | violation-of-restraining-order | all-other-crimes | -104.816860 | 39.796717 | montbello | 1 | 0 |
| 2012-01-02 00:16:00 | traffic-accident-dui-duid | traffic-accident | -104.971851 | 39.736874 | cheesman-park | 0 | 1 |
| 2012-01-02 00:47:00 | traffic-accident | traffic-accident | -104.951824 | 39.740168 | congress-park | 0 | 1 |
| 2012-01-02 01:35:00 | aggravated-assault | aggravated-assault | -104.998131 | 39.749922 | union-station | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2012-06-30 23:40:00 | traffic-accident-hit-and-run | traffic-accident | -104.922266 | 39.678471 | goldsmith | 0 | 1 |
| 2012-06-30 23:40:00 | traffic-accident-dui-duid | traffic-accident | -104.922266 | 39.678471 | goldsmith | 0 | 1 |
| 2012-06-30 23:44:00 | traffic-accident | traffic-accident | -104.987578 | 39.711158 | baker | 0 | 1 |
| 2012-06-30 23:50:00 | criminal-mischief-mtr-veh | public-disorder | -104.838271 | 39.788683 | montbello | 1 | 0 |
| 2012-06-30 23:54:00 | traffic-accident-hit-and-run | traffic-accident | -105.014162 | 39.740439 | lincoln-park | 0 | 1 |
27488 rows × 7 columns
crime_sort.first('5D') # 5日
crime_sort.first('5B') # 5勤務日
crime_sort.first('7W') # 7週、週末は日曜
crime_sort.first('3QS') # 第3四半期の開始
crime_sort.first('A') # 1年の終わり
resampleとgroupbyを使って、週ごとの犯罪件数をカウントする(1)crime hdf5データセットを読み込み、インデックスをREORTED_DATEに設定、ソートしておく
crime_sort = pd.read_hdf(p / 'crime.h5', 'crime').set_index('REPORTED_DATE').sort_index()
crime_sort.resample('W')
<pandas.core.resample.DatetimeIndexResampler object at 0x7fb5224025e0>
cf.resampleのメソッドは以下がある
r = crime_sort.resample('W')
resample_methods = [attr for attr in dir(r) if attr[0].islower()]
print(resample_methods)
['agg', 'aggregate', 'apply', 'asfreq', 'ax', 'backfill', 'bfill', 'binner', 'count', 'ffill', 'fillna', 'first', 'get_group', 'groupby', 'groups', 'indices', 'interpolate', 'kind', 'last', 'max', 'mean', 'median', 'min', 'ndim', 'nearest', 'ngroups', 'nunique', 'ohlc', 'pad', 'pipe', 'plot', 'prod', 'quantile', 'sem', 'size', 'std', 'sum', 'transform', 'var']
(3)週ごとの犯罪件数
weekly_crimes = crime_sort.resample('W').size()
weekly_crimes.head()
REPORTED_DATE 2012-01-08 877 2012-01-15 1071 2012-01-22 991 2012-01-29 988 2012-02-05 888 Freq: W-SUN, dtype: int64
(4)(3)が正しいかを確認
len(crime_sort.loc[:'2012-1-8'])
877
len(crime_sort.loc['2012-1-9':'2012-1-15'])
1071
(5)アンカーオフセットを使い、日曜以外で週を終わるようにした結果をみる
crime_sort.resample('W-THU').size().head()
REPORTED_DATE 2012-01-05 462 2012-01-12 1116 2012-01-19 924 2012-01-26 1061 2012-02-02 926 Freq: W-THU, dtype: int64
(6)
resampleはgroupbyメソッドでも対応可能pd.Grouperオブジェクトで渡す必要があるpd.TimeGrouperというよく似たオブジェクトがあるが使わないことweekly_crimes_gby = crime_sort.groupby(pd.Grouper(freq='W')).size()
weekly_crimes_gby.head()
REPORTED_DATE 2012-01-08 877 2012-01-15 1071 2012-01-22 991 2012-01-29 988 2012-02-05 888 Freq: W-SUN, dtype: int64
補足: TimeStampの列をインデックスにセットしたくない場合
Timestampをセットせずとも、実はresampleを使うことができるcrime = pd.read_hdf(p / 'crime.h5', 'crime')
weekly_crimes2 = crime.resample('W', on='REPORTED_DATE').size()
print(weekly_crimes2.equals(weekly_crimes))
weekly_crimes2
True
REPORTED_DATE
2012-01-08 877
2012-01-15 1071
2012-01-22 991
2012-01-29 988
2012-02-05 888
...
2017-09-03 1956
2017-09-10 1733
2017-09-17 1976
2017-09-24 1839
2017-10-01 1059
Freq: W-SUN, Length: 300, dtype: int64
keyパラメータでTimestampを選び、pd.Grouperでgroupbyを使えるweekly_crimes_gby2 = crime.groupby(pd.Grouper(key='REPORTED_DATE', freq='W')).size()
weekly_crimes_gby2.equals(weekly_crimes)
True
weekly_crimes.plot(figsize=(16, 4), title='All Denver Crimes')
<AxesSubplot:title={'center':'All Denver Crimes'}, xlabel='REPORTED_DATE'>
resampleは期間でのグループ分けと特定から無で別々の集約をサポートするresampleを使って、四半期のグループ分けをし、犯罪と交通事故を別々に数える(1)読み込み
crime_sort = pd.read_hdf(p / 'crime.h5', 'crime').set_index('REPORTED_DATE').sort_index()
(2)resampleで四半期のグループ分け後、各グループのIS_CRIMEとIS_TRAFFICの和を別々に取る
crime_quartely = crime_sort.resample('Q')['IS_CRIME', 'IS_TRAFFIC'].sum()
crime_quartely.head()
| IS_CRIME | IS_TRAFFIC | |
|---|---|---|
| REPORTED_DATE | ||
| 2012-03-31 | 7882 | 4726 |
| 2012-06-30 | 9641 | 5255 |
| 2012-09-30 | 10566 | 5003 |
| 2012-12-31 | 9197 | 4802 |
| 2013-03-31 | 8730 | 4442 |
(3)
Q」は最終日を取得する)QS」を使い、四半期初日にするcrime_sort.resample('QS')['IS_CRIME', 'IS_TRAFFIC'].sum().head()
| IS_CRIME | IS_TRAFFIC | |
|---|---|---|
| REPORTED_DATE | ||
| 2012-01-01 | 7882 | 4726 |
| 2012-04-01 | 9641 | 5255 |
| 2012-07-01 | 10566 | 5003 |
| 2012-10-01 | 9197 | 4802 |
| 2013-01-01 | 8730 | 4442 |
(4)結果を第2四半期データが正しいかどうかで検証する
crime_sort.loc['2012-4-1':'2012-6-30', ['IS_CRIME', 'IS_TRAFFIC']].sum()
IS_CRIME 9641 IS_TRAFFIC 5255 dtype: int64
(5)groupbyでも(2)と同じ結果が得られる(FutureWarning出るが)
crime_quarterly2 = crime_sort.groupby(pd.Grouper(freq='Q'))['IS_CRIME', 'IS_TRAFFIC'].sum()
print(crime_quarterly2.equals(crime_quartely))
crime_quarterly2.head()
True
<ipython-input-94-7d960d7483e6>:1: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead. crime_quarterly2 = crime_sort.groupby(pd.Grouper(freq='Q'))['IS_CRIME', 'IS_TRAFFIC'].sum()
| IS_CRIME | IS_TRAFFIC | |
|---|---|---|
| REPORTED_DATE | ||
| 2012-03-31 | 7882 | 4726 |
| 2012-06-30 | 9641 | 5255 |
| 2012-09-30 | 10566 | 5003 |
| 2012-12-31 | 9197 | 4802 |
| 2013-03-31 | 8730 | 4442 |
(6)犯罪と交通事故の傾向を分析するためにプロットする
plot_kwargs = dict(figsize=(16,4), color=['black', 'lightgrey'], \
title='Denver Crimes and Traffic Accidents')
crime_quartely.plot(**plot_kwargs)
<AxesSubplot:title={'center':'Denver Crimes and Traffic Accidents'}, xlabel='REPORTED_DATE'>
補足:犯罪と交通事故の増加パーセントをプロットする
crime_begin = crime_quartely.iloc[0]
crime_begin
IS_CRIME 7882 IS_TRAFFIC 4726 Name: 2012-03-31 00:00:00, dtype: int64
df.sub() ・・・以下では1を引いているcrime_quarterly2 = crime_quartely.div(crime_begin) \
.sub(1) \
.round(2)
crime_quarterly2.head()
| IS_CRIME | IS_TRAFFIC | |
|---|---|---|
| REPORTED_DATE | ||
| 2012-03-31 | 0.00 | 0.00 |
| 2012-06-30 | 0.22 | 0.11 |
| 2012-09-30 | 0.34 | 0.06 |
| 2012-12-31 | 0.17 | 0.02 |
| 2013-03-31 | 0.11 | -0.06 |
crime_quarterly2.plot(**plot_kwargs)
<AxesSubplot:title={'center':'Denver Crimes and Traffic Accidents'}, xlabel='REPORTED_DATE'>
Timestampから直接抽出する機能が必要Timestampのdtアクセサに組み込まれている(1)読み込む、REPORTED_DATEはそのまま
crime = pd.read_hdf(p / 'crime.h5', 'crime')
crime.head()
| OFFENSE_TYPE_ID | OFFENSE_CATEGORY_ID | REPORTED_DATE | GEO_LON | GEO_LAT | NEIGHBORHOOD_ID | IS_CRIME | IS_TRAFFIC | |
|---|---|---|---|---|---|---|---|---|
| 0 | traffic-accident-dui-duid | traffic-accident | 2014-06-29 02:01:00 | -105.000149 | 39.745753 | cbd | 0 | 1 |
| 1 | vehicular-eluding-no-chase | all-other-crimes | 2014-06-29 01:54:00 | -104.884660 | 39.738702 | east-colfax | 1 | 0 |
| 2 | disturbing-the-peace | public-disorder | 2014-06-29 02:00:00 | -105.020719 | 39.706674 | athmar-park | 1 | 0 |
| 3 | curfew | public-disorder | 2014-06-29 02:18:00 | -105.001552 | 39.769505 | sunnyside | 1 | 0 |
| 4 | aggravated-assault | aggravated-assault | 2014-06-29 04:17:00 | -105.018557 | 39.679229 | college-view-south-platte | 1 | 0 |
(2)
Timestampカラムにはdtアクセサという属性があり、様々な属性やメソッドにアクセスできるwd_counts = crime['REPORTED_DATE'].dt.weekday_name.value_counts()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-103-1f553e3627dd> in <module> ----> 1 wd_counts = crime['REPORTED_DATE'].dt.weekday_name.value_counts() AttributeError: 'DatetimeProperties' object has no attribute 'weekday_name'
注意:weekday_nameは使えない。今はday_name()
wd_counts = crime['REPORTED_DATE'].dt.day_name().value_counts()
wd_counts
Monday 70024 Friday 69621 Wednesday 69538 Thursday 69287 Tuesday 68394 Saturday 58834 Sunday 55213 Name: REPORTED_DATE, dtype: int64
(3)週末は犯罪も交通事故も少ない。データを整理して棒グラフ化
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
title = 'Denver Crimes and Traffic Accidents per Weekday'
wd_counts.reindex(days).plot(kind='barh', title=title)
<AxesSubplot:title={'center':'Denver Crimes and Traffic Accidents per Weekday'}>
(4)年ごとの件数を数える
title = 'Denver Crimes and Traffic Accidents per Year'
crime['REPORTED_DATE'].dt.year.value_counts()\
.sort_index()\
.plot(kind='barh', title=title)
<AxesSubplot:title={'center':'Denver Crimes and Traffic Accidents per Year'}>