import pathlib
import pandas as pd
import numpy as np
p = pathlib.Path('〜〜〜')
college = pd.read_csv(p / 'college.csv')
college.head()
| INSTNM | CITY | STABBR | HBCU | MENONLY | WOMENONLY | RELAFFIL | SATVRMID | SATMTMID | DISTANCEONLY | ... | UGDS_2MOR | UGDS_NRA | UGDS_UNKN | PPTUG_EF | CURROPER | PCTPELL | PCTFLOAN | UG25ABV | MD_EARN_WNE_P10 | GRAD_DEBT_MDN_SUPP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Alabama A & M University | Normal | AL | 1.0 | 0.0 | 0.0 | 0 | 424.0 | 420.0 | 0.0 | ... | 0.0000 | 0.0059 | 0.0138 | 0.0656 | 1 | 0.7356 | 0.8284 | 0.1049 | 30300 | 33888 |
| 1 | University of Alabama at Birmingham | Birmingham | AL | 0.0 | 0.0 | 0.0 | 0 | 570.0 | 565.0 | 0.0 | ... | 0.0368 | 0.0179 | 0.0100 | 0.2607 | 1 | 0.3460 | 0.5214 | 0.2422 | 39700 | 21941.5 |
| 2 | Amridge University | Montgomery | AL | 0.0 | 0.0 | 0.0 | 1 | NaN | NaN | 1.0 | ... | 0.0000 | 0.0000 | 0.2715 | 0.4536 | 1 | 0.6801 | 0.7795 | 0.8540 | 40100 | 23370 |
| 3 | University of Alabama in Huntsville | Huntsville | AL | 0.0 | 0.0 | 0.0 | 0 | 595.0 | 590.0 | 0.0 | ... | 0.0172 | 0.0332 | 0.0350 | 0.2146 | 1 | 0.3072 | 0.4596 | 0.2640 | 45500 | 24097 |
| 4 | Alabama State University | Montgomery | AL | 1.0 | 0.0 | 0.0 | 0 | 425.0 | 430.0 | 0.0 | ... | 0.0098 | 0.0243 | 0.0137 | 0.0892 | 1 | 0.7347 | 0.7554 | 0.1270 | 26600 | 33118.5 |
5 rows × 27 columns
describe()は、デフォルトではカテゴリカラムはスルーされる。数値カラムだけの要約を出力する.Tで転置すると見やすいnp.numberでも'number'でも同様int, floatを包括college.describe(include=[np.number]).T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| HBCU | 7164.0 | 0.014238 | 0.118478 | 0.0 | 0.000000 | 0.00000 | 0.000000 | 1.0000 |
| MENONLY | 7164.0 | 0.009213 | 0.095546 | 0.0 | 0.000000 | 0.00000 | 0.000000 | 1.0000 |
| WOMENONLY | 7164.0 | 0.005304 | 0.072642 | 0.0 | 0.000000 | 0.00000 | 0.000000 | 1.0000 |
| RELAFFIL | 7535.0 | 0.190975 | 0.393096 | 0.0 | 0.000000 | 0.00000 | 0.000000 | 1.0000 |
| SATVRMID | 1185.0 | 522.819409 | 68.578862 | 290.0 | 475.000000 | 510.00000 | 555.000000 | 765.0000 |
| SATMTMID | 1196.0 | 530.765050 | 73.469767 | 310.0 | 482.000000 | 520.00000 | 565.000000 | 785.0000 |
| DISTANCEONLY | 7164.0 | 0.005583 | 0.074519 | 0.0 | 0.000000 | 0.00000 | 0.000000 | 1.0000 |
| UGDS | 6874.0 | 2356.837940 | 5474.275871 | 0.0 | 117.000000 | 412.50000 | 1929.500000 | 151558.0000 |
| UGDS_WHITE | 6874.0 | 0.510207 | 0.286958 | 0.0 | 0.267500 | 0.55570 | 0.747875 | 1.0000 |
| UGDS_BLACK | 6874.0 | 0.189997 | 0.224587 | 0.0 | 0.036125 | 0.10005 | 0.257700 | 1.0000 |
| UGDS_HISP | 6874.0 | 0.161635 | 0.221854 | 0.0 | 0.027600 | 0.07140 | 0.198875 | 1.0000 |
| UGDS_ASIAN | 6874.0 | 0.033544 | 0.073777 | 0.0 | 0.002500 | 0.01290 | 0.032700 | 0.9727 |
| UGDS_AIAN | 6874.0 | 0.013813 | 0.070196 | 0.0 | 0.000000 | 0.00260 | 0.007300 | 1.0000 |
| UGDS_NHPI | 6874.0 | 0.004569 | 0.033125 | 0.0 | 0.000000 | 0.00000 | 0.002500 | 0.9983 |
| UGDS_2MOR | 6874.0 | 0.023950 | 0.031288 | 0.0 | 0.000000 | 0.01750 | 0.033900 | 0.5333 |
| UGDS_NRA | 6874.0 | 0.016086 | 0.050172 | 0.0 | 0.000000 | 0.00000 | 0.011700 | 0.9286 |
| UGDS_UNKN | 6874.0 | 0.045181 | 0.093440 | 0.0 | 0.000000 | 0.01430 | 0.045400 | 0.9027 |
| PPTUG_EF | 6853.0 | 0.226639 | 0.246470 | 0.0 | 0.000000 | 0.15040 | 0.376900 | 1.0000 |
| CURROPER | 7535.0 | 0.923291 | 0.266146 | 0.0 | 1.000000 | 1.00000 | 1.000000 | 1.0000 |
| PCTPELL | 6849.0 | 0.530643 | 0.225544 | 0.0 | 0.357800 | 0.52150 | 0.712900 | 1.0000 |
| PCTFLOAN | 6849.0 | 0.522211 | 0.283616 | 0.0 | 0.332900 | 0.58330 | 0.745000 | 1.0000 |
| UG25ABV | 6718.0 | 0.410021 | 0.228939 | 0.0 | 0.241500 | 0.40075 | 0.572275 | 1.0000 |
np.object'object'またはpd.Categoricalとなる# college.describe(include=[np.object, pd.Categorical]).T
college.describe(include=['object', pd.Categorical]).T
| count | unique | top | freq | |
|---|---|---|---|---|
| INSTNM | 7535 | 7535 | Johnson County Community College | 1 |
| CITY | 7535 | 2514 | New York | 87 |
| STABBR | 7535 | 59 | CA | 773 |
| MD_EARN_WNE_P10 | 6413 | 598 | PrivacySuppressed | 822 |
| GRAD_DEBT_MDN_SUPP | 7503 | 2038 | PrivacySuppressed | 1510 |
different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER', 'INSTNM', 'STABBR']
col2 = college[different_cols]
col2
| RELAFFIL | SATMTMID | CURROPER | INSTNM | STABBR | |
|---|---|---|---|---|---|
| 0 | 0 | 420.0 | 1 | Alabama A & M University | AL |
| 1 | 0 | 565.0 | 1 | University of Alabama at Birmingham | AL |
| 2 | 1 | NaN | 1 | Amridge University | AL |
| 3 | 0 | 590.0 | 1 | University of Alabama in Huntsville | AL |
| 4 | 0 | 430.0 | 1 | Alabama State University | AL |
| ... | ... | ... | ... | ... | ... |
| 7530 | 1 | NaN | 1 | SAE Institute of Technology San Francisco | CA |
| 7531 | 1 | NaN | 1 | Rasmussen College - Overland Park | KS |
| 7532 | 1 | NaN | 1 | National Personal Training Institute of Cleveland | OH |
| 7533 | 1 | NaN | 1 | Bay Area Medical Academy - San Jose Satellite ... | CA |
| 7534 | 1 | NaN | 1 | Excel Learning Center-San Antonio South | TX |
7535 rows × 5 columns
# テキスト通りにしてみる <- たしかにちがう
col2 = college.loc[:, different_cols]
col2.dtypes
RELAFFIL int64 SATMTMID float64 CURROPER int64 INSTNM object STABBR object dtype: object
# 各カラムのメモリ使用量
original_mem = col2.memory_usage(deep=True)
original_mem
Index 128 RELAFFIL 60280 SATMTMID 60280 CURROPER 60280 INSTNM 660240 STABBR 444565 dtype: int64
# RELAFILは0or1のみだから64bit整数は無駄なので8bitINTに変換
col2['RELAFFIL'] = col2['RELAFFIL'].astype('int8')
# col2['RELAFFIL'] = col2['RELAFFIL'].astype(np.int8)
col2.dtypes
RELAFFIL int8 SATMTMID float64 CURROPER int64 INSTNM object STABBR object dtype: object
col2.memory_usage(deep=True)
# RELAFFILは元の13%まで削減できた
Index 128 RELAFFIL 7535 SATMTMID 60280 CURROPER 60280 INSTNM 660240 STABBR 444565 dtype: int64
# さらにメモリ使用量を削減したい。object型のユニーク数が少ないものを調べる
# 8bit=256, 16bit=65536
col2.select_dtypes(include=['object']).nunique()
INSTNM 7535 STABBR 59 dtype: int64
col2['STABBR'] = col2['STABBR'].astype('category')
col2.dtypes
RELAFFIL int8 SATMTMID float64 CURROPER int64 INSTNM object STABBR category dtype: object
col2.memory_usage(deep=True)
# STABBRは元の約3%まで削減できたい
Index 128 RELAFFIL 7535 SATMTMID 60280 CURROPER 60280 INSTNM 660699 STABBR 13120 dtype: int64
nlargest(int, col_name)nsmallest(int, col_name)movie = pd.read_csv(p / 'movie.csv')
movie2 = movie[['movie_title', 'imdb_score', 'budget']]
movie2.head()
| movie_title | imdb_score | budget | |
|---|---|---|---|
| 0 | Avatar | 7.9 | 237000000.0 |
| 1 | Pirates of the Caribbean: At World's End | 7.1 | 300000000.0 |
| 2 | Spectre | 6.8 | 245000000.0 |
| 3 | The Dark Knight Rises | 8.5 | 250000000.0 |
| 4 | Star Wars: Episode VII - The Force Awakens | 7.1 | NaN |
# imdb_scoreカラムの上位100位の映画(nlargestメソッド)
movie2.nlargest(100, 'imdb_score').head()
| movie_title | imdb_score | budget | |
|---|---|---|---|
| 2725 | Towering Inferno | 9.5 | NaN |
| 1920 | The Shawshank Redemption | 9.3 | 25000000.0 |
| 3402 | The Godfather | 9.2 | 6000000.0 |
| 2779 | Dekalog | 9.1 | NaN |
| 4312 | Kickboxer: Vengeance | 9.1 | 17000000.0 |
# 上位100の映画の中から最安値の5本(nsmallest)
movie2.nlargest(100, 'imdb_score').nsmallest(5, 'budget')
| movie_title | imdb_score | budget | |
|---|---|---|---|
| 4804 | Butterfly Girl | 8.7 | 180000.0 |
| 4801 | Children of Heaven | 8.5 | 180000.0 |
| 4706 | 12 Angry Men | 8.9 | 350000.0 |
| 4550 | A Separation | 8.4 | 500000.0 |
| 4636 | The Other Dream Team | 8.4 | 500000.0 |
sort_values()とdrop_duplicates()の組み合わせmovie2 = movie[['movie_title', 'title_year', 'imdb_score']]
movie2.sort_values('title_year', ascending=False).head()
| movie_title | title_year | imdb_score | |
|---|---|---|---|
| 3884 | The Veil | 2016.0 | 4.7 |
| 2375 | My Big Fat Greek Wedding 2 | 2016.0 | 6.1 |
| 2794 | Miracles from Heaven | 2016.0 | 6.8 |
| 92 | Independence Day: Resurgence | 2016.0 | 5.5 |
| 153 | Kung Fu Panda 3 | 2016.0 | 7.2 |
movie3 = movie2.sort_values(['title_year', 'imdb_score'], ascending=False)
movie3.head()
| movie_title | title_year | imdb_score | |
|---|---|---|---|
| 4312 | Kickboxer: Vengeance | 2016.0 | 9.1 |
| 4277 | A Beginner's Guide to Snuff | 2016.0 | 8.7 |
| 3798 | Airlift | 2016.0 | 8.5 |
| 27 | Captain America: Civil War | 2016.0 | 8.2 |
| 98 | Godzilla Resurgence | 2016.0 | 8.2 |
# title_yearが重複したものを初回分を除き削除(デフォルト:keep=top)
movie_top_year = movie3.drop_duplicates(subset='title_year')
movie_top_year.head(10)
| movie_title | title_year | imdb_score | |
|---|---|---|---|
| 4312 | Kickboxer: Vengeance | 2016.0 | 9.1 |
| 3745 | Running Forever | 2015.0 | 8.6 |
| 4369 | Queen of the Mountains | 2014.0 | 8.7 |
| 3935 | Batman: The Dark Knight Returns, Part 2 | 2013.0 | 8.4 |
| 3 | The Dark Knight Rises | 2012.0 | 8.5 |
| 3853 | Samsara | 2011.0 | 8.5 |
| 97 | Inception | 2010.0 | 8.8 |
| 67 | Up | 2009.0 | 8.3 |
| 66 | The Dark Knight | 2008.0 | 9.0 |
| 2646 | U2 3D | 2007.0 | 8.4 |
sort_values()でやるmovie = pd.read_csv(p / 'movie.csv')
movie2 = movie[['movie_title', 'imdb_score', 'budget']]
# 上位100本を表示
movie2.sort_values('imdb_score', ascending=False).head(100)
| movie_title | imdb_score | budget | |
|---|---|---|---|
| 2725 | Towering Inferno | 9.5 | NaN |
| 1920 | The Shawshank Redemption | 9.3 | 25000000.0 |
| 3402 | The Godfather | 9.2 | 6000000.0 |
| 2779 | Dekalog | 9.1 | NaN |
| 4312 | Kickboxer: Vengeance | 9.1 | 17000000.0 |
| ... | ... | ... | ... |
| 3799 | Anne of Green Gables | 8.4 | NaN |
| 3777 | Requiem for a Dream | 8.4 | 4500000.0 |
| 3935 | Batman: The Dark Knight Returns, Part 2 | 8.4 | 3500000.0 |
| 4636 | The Other Dream Team | 8.4 | 500000.0 |
| 2455 | Aliens | 8.4 | 18500000.0 |
100 rows × 3 columns
movie2.sort_values('imdb_score', ascending=False).head(100) \
.sort_values('budget').head()
| movie_title | imdb_score | budget | |
|---|---|---|---|
| 4815 | A Charlie Brown Christmas | 8.4 | 150000.0 |
| 4801 | Children of Heaven | 8.5 | 180000.0 |
| 4804 | Butterfly Girl | 8.7 | 180000.0 |
| 4706 | 12 Angry Men | 8.9 | 350000.0 |
| 4636 | The Other Dream Team | 8.4 | 500000.0 |