# (Blog layout marker "반응형" / "responsive" -- scraping residue, not part of the script.)
import numpy as np # Pandas is based on the Numpy
import pandas as pd
""" Pandas Objects """
# ===== Series objects =====
# A Series pairs a one-dimensional array of values with an index.
s = pd.Series([0, 0.25, 0.5, 0.75, 1.0])  # built from a plain list
# Printing `s` at this point would show:
#   0    0.00
#   1    0.25
#   2    0.50
#   3    0.75
#   4    1.00
#   dtype: float64
print(s.index)   # RangeIndex(start=0, stop=5, step=1)
print(s.values)  # [0.   0.25 0.5  0.75 1.  ]
print(s[1:4])    # slice: the rows numbered 1, 2 and 3

# Same data, this time with explicit string labels as the index.
s = pd.Series(
    [0, 0.25, 0.5, 0.75, 1.0],
    index=list("abcde"),
)
# Printing `s` would show:
#   a    0.00
#   b    0.25
#   c    0.50
#   d    0.75
#   e    1.00
#   dtype: float64
print(s["c"])  # 0.5 -- value stored under label 'c'
print(s[["c", "d", "e"]])  # a list of labels selects just those rows
#   c    0.50
#   d    0.75
#   e    1.00
#   dtype: float64
print(s["b":])  # label slice from 'b' to the end
#   b    0.25
#   c    0.50
#   d    0.75
#   e    1.00
#   dtype: float64
print("b" in s)  # True
print(s.unique())  # [0.   0.25 0.5  0.75 1.  ]
print(s.value_counts())  # how many times each value occurs (once each here)
#   0.00    1
#   0.25    1
#   0.50    1
#   0.75    1
#   1.00    1
#   dtype: int64
print(s.isin([0.25, 0.75]))  # boolean mask: is each value one of the given ones?
#   a    False
#   b     True
#   c    False
#   d     True
#   e    False
#   dtype: bool
# A dict converts directly to a Series: keys become labels, values the data.
pop_dic = {
    "Seoul": 9720846,
    "Busan": 343434,
    "Incheon": 35233,
    "Daegu": 45355,
    "Daejun": 553535,
}
population = pd.Series(pop_dic)
print(population)
#   Seoul      9720846
#   Busan       343434
#   Incheon      35233
#   Daegu        45355
#   Daejun      553535
#   dtype: int64
print(population["Seoul"])  # 9720846 -- single-label lookup
print(population["Seoul":"Incheon"])  # label slice, end label included
#   Seoul      9720846
#   Busan       343434
#   Incheon      35233
#   dtype: int64
# ===== DataFrame objects =====
# From a list of dicts: the union of all keys becomes the columns, and any
# key missing from a row is filled with NaN.
a = pd.DataFrame(
    [
        {"A": 2, "B": 4, "D": 3},
        {"A": 4, "B": 5, "C": 7},
    ]
)
print(a)
#      A  B    D    C
#   0  2  4  3.0  NaN
#   1  4  5  NaN  7.0

# From a 2-D NumPy array with explicit column and row labels.
b = pd.DataFrame(
    np.random.rand(5, 5),
    columns=list("ABCDE"),
    index=[1, 2, 3, 4, 5],
)
print(b)  # 5x5 table of random floats; the numbers differ on every run, e.g.
#             A         B         C         D         E
#   1  0.226003  0.180817  0.593278  0.352600  0.545705
#   2  0.689239  0.937041  0.064205  0.449810  0.437920
#   ...
# Build a DataFrame by combining several Series that share the same labels.
pop_dic = {
    "Seoul": 9720846,
    "Busan": 343434,
    "Incheon": 35233,
    "Daegu": 45355,
    "Daejun": 553535,
}
population = pd.Series(pop_dic)

male_dic = {  # male population
    "Seoul": 12134,
    "Busan": 3434,
    "Incheon": 2424,
    "Daegu": 4355,
    "Daejun": 3535,
}
male = pd.Series(male_dic)

female_dic = {  # female population
    "Seoul": 3412,
    "Busan": 5451,
    "Incheon": 2312,
    "Daegu": 2231,
    "Daejun": 7744,
}
female = pd.Series(female_dic)

korea_df = pd.DataFrame(
    {"Population": population, "Male": male, "Female": female}
)
print(korea_df)
#            Population   Male  Female
#   Seoul       9720846  12134    3412
#   Busan        343434   3434    5451
#   Incheon       35233   2424    2312
#   Daegu         45355   4355    2231
#   Daejun       553535   3535    7744
print(korea_df.index)    # Index(['Seoul', 'Busan', 'Incheon', 'Daegu', 'Daejun'], dtype='object')
print(korea_df.columns)  # Index(['Population', 'Male', 'Female'], dtype='object')
print(korea_df["Female"])  # one column comes back as a Series
#   Seoul      3412
#   Busan      5451
#   Incheon    2312
#   Daegu      2231
#   Daejun     7744
#   Name: Female, dtype: int64
print(korea_df["Seoul":"Incheon"])  # a slice inside [] selects rows
#            Population   Male  Female
#   Seoul       9720846  12134    3412
#   Busan        343434   3434    5451
#   Incheon       35233   2424    2312
# ===== Index objects =====
# Index         : general-purpose Index; axis labels held as a NumPy-style array
# Int64Index    : Index specialised for integers (removed in pandas 2.0, where
#                 a plain Index with dtype int64 is used instead)
# MultiIndex    : hierarchical Index with several levels on one axis
#                 (similar to an array of tuples)
# DatetimeIndex : timestamps stored as NumPy datetime64
# PeriodIndex   : Index for period (time-span) data
#
# The reprs below are the pandas >= 2.0 form; older versions print
# `Int64Index([...], dtype='int64')` instead.
idx = pd.Index([2, 4, 6, 8, 10])
print(idx)         # Index([2, 4, 6, 8, 10], dtype='int64')
print(idx[1])      # 4 -- single element
print(idx[1:2:2])  # Index([4], dtype='int64') -- slice
print(idx[-1::])   # Index([10], dtype='int64') -- from the last element on
print(idx[::2])    # Index([2, 6, 10], dtype='int64') -- every second element
print(idx)         # Index([2, 4, 6, 8, 10], dtype='int64') -- unchanged
print(idx.size)    # 5
print(idx.shape)   # (5,)
print(idx.ndim)    # 1
print(idx.dtype)   # int64
##### Index operations #####
# append               : return a new Index with another Index appended
# difference           : set difference
# intersection         : set intersection
# union                : set union
# symmetric_difference : elements found in exactly one of the two indexes
# isin                 : boolean array telling whether each label is in the given values
# delete               : new Index with the element at a given position removed
# drop                 : new Index with the given label value(s) removed
# insert               : new Index with a value inserted
# is_monotonic_increasing / is_monotonic_decreasing : True if the labels are monotonic
#                        (the older `is_monotonic` alias was removed in pandas 2.0)
# is_unique            : True if no label is duplicated
# unique               : return only the distinct labels
#
# NOTE: the operator spellings `idx1 & idx2`, `idx1 | idx2` and `idx1 ^ idx2`
# used to be aliases for intersection / union / symmetric_difference. That was
# deprecated in pandas 1.x, and since pandas 2.0 the operators work
# element-wise instead (e.g. `idx1 & idx2` no longer yields [2, 4, 6]).
# Always use the named methods for set logic.
# The reprs below are the pandas >= 2.0 form (`Index(...)` rather than
# `Int64Index(...)`).
idx1 = pd.Index([1, 2, 4, 6, 8])
idx2 = pd.Index([2, 4, 5, 6, 7])

print(idx1.append(idx2))      # Index([1, 2, 4, 6, 8, 2, 4, 5, 6, 7], dtype='int64')
print(idx1.difference(idx2))  # Index([1, 8], dtype='int64')

idx_common = idx1.intersection(idx2)
print(idx_common)             # Index([2, 4, 6], dtype='int64')

idx_union = idx1.union(idx2)
print(idx_union)              # Index([1, 2, 4, 5, 6, 7, 8], dtype='int64')

print(idx1.delete(0))         # Index([2, 4, 6, 8], dtype='int64') -- removes position 0
print(idx1.drop(1))           # Index([2, 4, 6, 8], dtype='int64') -- removes the label 1

idx_sym_diff = idx1.symmetric_difference(idx2)
print(idx_sym_diff)           # Index([1, 5, 7, 8], dtype='int64')
# ===== Indexing in detail =====
s = pd.Series(
    [0, 0.25, 0.5, 0.75, 1.0],
    index=list("abcde"),
)
print(s)
#   a    0.00
#   b    0.25
#   c    0.50
#   d    0.75
#   e    1.00
#   dtype: float64
print(s.keys())         # Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
print(s.items())        # <zip object at 0x...> -- a lazy iterator of (label, value)
print(list(s.items()))  # [('a', 0.0), ('b', 0.25), ('c', 0.5), ('d', 0.75), ('e', 1.0)]

# Like a dict, assigning to a new key adds an entry.
s["f"] = 1.25
print(s)
#   a    0.00
#   b    0.25
#   c    0.50
#   d    0.75
#   e    1.00
#   f    1.25
#   dtype: float64
print(s["a":"d"])  # label slice -- the end label 'd' is included
#   a    0.00
#   b    0.25
#   c    0.50
#   d    0.75
#   dtype: float64
print(s[0:4])  # integer slice -- the end is excluded; same four rows as above
print(s[(s > 0.4) & (s < 0.8)])  # boolean mask: keep rows meeting both conditions
#   c    0.50
#   d    0.75
#   dtype: float64
print(s[["a", "c", "e"]])  # pick specific labels
#   a    0.0
#   c    0.5
#   e    1.0
#   dtype: float64
##### Series indexing #####
# With integer labels, `s[...]` and `s.iloc[...]` can disagree -- compare the
# results below.
s = pd.Series(
    ["a", "b", "c", "d", "e"],
    index=[1, 3, 5, 7, 9],
)
print(s)
#   1    a
#   3    b
#   5    c
#   7    d
#   9    e
#   dtype: object
print(s[2:4])  # slice -> the 3rd and 4th rows
#   5    c
#   7    d
#   dtype: object
print(s[1])       # a -- the row whose LABEL is 1
print(s.iloc[1])  # b -- the row at POSITION 1
print(s.iloc[2:4])  # same two rows as s[2:4]
#   5    c
#   7    d
#   dtype: object

# reindex builds a Series on a new set of labels; labels with no match get NaN.
print(s.reindex(range(10)))
#   0    NaN
#   1      a
#   2    NaN
#   3      b
#   4    NaN
#   5      c
#   6    NaN
#   7      d
#   8    NaN
#   9      e
#   dtype: object

# method='bfill' fills each gap with the next available value instead.
print(s.reindex(range(10), method="bfill"))
#   0    a
#   1    a
#   2    b
#   3    b
#   4    c
#   5    c
#   6    d
#   7    d
#   8    e
#   9    e
#   dtype: object
##### DataFrame indexing #####
# df[val]                   : select one column or several columns
# df.loc[val]               : select a subset of rows by label
# df.loc[:, val]            : select a subset of columns by label
# df.loc[val1, val2]        : select rows and columns by label
# df.iloc[where]            : select a subset of rows by integer position
# df.iloc[:, where]         : select a subset of columns by integer position
# df.iloc[where_i, where_j] : select rows and columns by integer position
# df.at[label_i, label_j]   : single value by row and column label
# df.iat[i, j]              : single value by row and column integer position
# reindex                   : re-index one or more axes with new labels
# get_value, set_value      : single value by row/column name
#                             (removed in pandas 1.0 -- use .at / .iat)

# ---- rebuild the sample frame ----
pop_dic = {
    "Seoul": 9720846,
    "Busan": 343434,
    "Incheon": 35233,
    "Daegu": 45355,
    "Daejun": 553535,
}
population = pd.Series(pop_dic)

male_dic = {  # male population
    "Seoul": 12134,
    "Busan": 3434,
    "Incheon": 2424,
    "Daegu": 4355,
    "Daejun": 3535,
}
male = pd.Series(male_dic)

female_dic = {  # female population
    "Seoul": 3412,
    "Busan": 5451,
    "Incheon": 2312,
    "Daegu": 2231,
    "Daejun": 7744,
}
female = pd.Series(female_dic)

korea_df = pd.DataFrame(
    {"Population": population, "Male": male, "Female": female}
)
print(korea_df)
#            Population   Male  Female
#   Seoul       9720846  12134    3412
#   Busan        343434   3434    5451
#   Incheon       35233   2424    2312
#   Daegu         45355   4355    2231
#   Daejun       553535   3535    7744
# ----------------------------------

# Column access: dict-style and attribute-style return the same Series.
print(korea_df["Male"])
#   Seoul      12134
#   Busan       3434
#   Incheon     2424
#   Daegu       4355
#   Daejun      3535
#   Name: Male, dtype: int64
print(korea_df.Male)  # identical output

# New column: males per 100 females.
korea_df["RatioMFE"] = korea_df["Male"] * 100 / korea_df["Female"]
print(korea_df.RatioMFE)
#   Seoul      355.627198
#   Busan       62.997615
#   Incheon    104.844291
#   Daegu      195.203944
#   Daejun      45.648244
#   Name: RatioMFE, dtype: float64

print(korea_df.values)  # the underlying 2-D array, without labels
#   [[9.72084600e+06 1.21340000e+04 3.41200000e+03 3.55627198e+02]
#    [3.43434000e+05 3.43400000e+03 5.45100000e+03 6.29976151e+01]
#    [3.52330000e+04 2.42400000e+03 2.31200000e+03 1.04844291e+02]
#    [4.53550000e+04 4.35500000e+03 2.23100000e+03 1.95203944e+02]
#    [5.53535000e+05 3.53500000e+03 7.74400000e+03 4.56482438e+01]]
print(korea_df.T)  # transpose: rows and columns swapped
#                      Seoul          Busan        Incheon          Daegu         Daejun
#   Population  9.720846e+06  343434.000000   35233.000000   45355.000000  553535.000000
#   Male        1.213400e+04    3434.000000    2424.000000    4355.000000    3535.000000
#   Female      3.412000e+03    5451.000000    2312.000000    2231.000000    7744.000000
#   RatioMFE    3.556272e+02      62.997615     104.844291     195.203944      45.648244
print(korea_df.values[0])  # first row of the array
#   [9.72084600e+06 1.21340000e+04 3.41200000e+03 3.55627198e+02]

print(korea_df.loc[:"Incheon", :"Male"])  # label slices on both axes (ends included)
#            Population   Male
#   Seoul       9720846  12134
#   Busan        343434   3434
#   Incheon       35233   2424

# Conditional selection with boolean masks.
print(korea_df.loc[korea_df["Female"] > 3000])
#           Population   Male  Female    RatioMFE
#   Seoul      9720846  12134    3412  355.627198
#   Busan       343434   3434    5451   62.997615
#   Daejun      553535   3535    7744   45.648244
print(korea_df.loc[korea_df["Population"] < 100000])
#            Population  Male  Female    RatioMFE
#   Incheon       35233  2424    2312  104.844291
#   Daegu         45355  4355    2231  195.203944
print(korea_df.loc[korea_df["RatioMFE"] > 100])
#            Population   Male  Female    RatioMFE
#   Seoul       9720846  12134    3412  355.627198
#   Incheon       35233   2424    2312  104.844291
#   Daegu         45355   4355    2231  195.203944

# Combine conditions with & (each side needs its own parentheses).
print(korea_df.loc[(korea_df["Population"] > 25000) & (korea_df["RatioMFE"] > 100)])
#            Population   Male  Female    RatioMFE
#   Seoul       9720846  12134    3412  355.627198
#   Incheon       35233   2424    2312  104.844291
#   Daegu         45355   4355    2231  195.203944

print(korea_df.iloc[:3, :2])  # first 3 rows, first 2 columns by position
#            Population   Male
#   Seoul       9720846  12134
#   Busan        343434   3434
#   Incheon       35233   2424
##### Multi-indexing #####
# Goes beyond 1-D Series and 2-D DataFrame to represent 3-D, 4-D and
# higher-dimensional data: a single index holds several index levels.

### Multi-indexed Series
# ---- rebuild the sample frame ----
pop_dic = {
    "Seoul": 9720846,
    "Busan": 343434,
    "Incheon": 35233,
    "Daegu": 45355,
    "Daejun": 553535,
}
population = pd.Series(pop_dic)

male_dic = {  # male population
    "Seoul": 12134,
    "Busan": 3434,
    "Incheon": 2424,
    "Daegu": 4355,
    "Daejun": 3535,
}
male = pd.Series(male_dic)

female_dic = {  # female population
    "Seoul": 3412,
    "Busan": 5451,
    "Incheon": 2312,
    "Daegu": 2231,
    "Daejun": 7744,
}
female = pd.Series(female_dic)

korea_df = pd.DataFrame(
    {"Population": population, "Male": male, "Female": female}
)
#            Population   Male  Female
#   Seoul       9720846  12134    3412
#   Busan        343434   3434    5451
#   Incheon       35233   2424    2312
#   Daegu         45355   4355    2231
#   Daejun       553535   3535    7744
# ----------------------------------

# Labels for a two-level index: (city, year).
idx_tuples = [
    ("Seoul", 2010), ("Seoul", 2020),
    ("Busan", 2010), ("Busan", 2020),
    ("Incheon", 2010), ("Incheon", 2020),
    ("Daegu", 2010), ("Daegu", 2020),
    ("Daejun", 2010), ("Daejun", 2020),
]
print(idx_tuples)
#   [('Seoul', 2010), ('Seoul', 2020), ('Busan', 2010), ('Busan', 2020),
#    ('Incheon', 2010), ('Incheon', 2020), ('Daegu', 2010), ('Daegu', 2020),
#    ('Daejun', 2010), ('Daejun', 2020)]

# Population figures, in the same order as idx_tuples.
pop_tuples = [
    10312545, 9720846,
    2567910, 3404423,
    2758296, 2947217,
    2511676, 2427954,
    1503664, 1471040,
]
print(pop_tuples)
#   [10312545, 9720846, 2567910, 3404423, 2758296, 2947217, 2511676,
#    2427954, 1503664, 1471040]

population = pd.Series(pop_tuples, index=idx_tuples)
print(population)
# Output recorded by the original author (labels shown as plain tuples):
#   (Seoul, 2010)      10312545
#   (Seoul, 2020)       9720846
#   (Busan, 2010)       2567910
#   ...
#   (Daejun, 2020)      1471040
#   dtype: int64
# NOTE(review): newer pandas may already print this as a two-level index.

midx = pd.MultiIndex.from_tuples(idx_tuples)
print(midx)
#   MultiIndex([(  'Seoul', 2010),
#               (  'Seoul', 2020),
#               (  'Busan', 2010),
#               (  'Busan', 2020),
#               ('Incheon', 2010),
#               ('Incheon', 2020),
#               (  'Daegu', 2010),
#               (  'Daegu', 2020),
#               ( 'Daejun', 2010),
#               ( 'Daejun', 2020)],
#              )

population = population.reindex(midx)
print(population)
#   Seoul    2010    10312545
#            2020     9720846
#   Busan    2010     2567910
#            2020     3404423
#   Incheon  2010     2758296
#            2020     2947217
#   Daegu    2010     2511676
#            2020     2427954
#   Daejun   2010     1503664
#            2020     1471040
#   dtype: int64

print(population[:, 2010])  # every city, year 2010 only
#   Seoul      10312545
#   Busan       2567910
#   Incheon     2758296
#   Daegu       2511676
#   Daejun      1503664
#   dtype: int64
print(population["Daejun", :])  # one city, every year
#   2010    1503664
#   2020    1471040
#   dtype: int64
### Multi-indexed DataFrame
# unstack() pivots the inner index level (year) out into columns ...
korea_mdf = population.unstack()
#              2010     2020
#   Busan     2567910  3404423
#   Daegu     2511676  2427954
#   Daejun    1503664  1471040
#   Incheon   2758296  2947217
#   Seoul    10312545  9720846
# ... and stack() folds the columns back into an inner index level.
print(korea_mdf.stack())
#   Busan    2010     2567910
#            2020     3404423
#   Daegu    2010     2511676
#            2020     2427954
#   Daejun   2010     1503664
#            2020     1471040
#   Incheon  2010     2758296
#            2020     2947217
#   Seoul    2010    10312545
#            2020     9720846
#   dtype: int64

# Male / female figures, listed in the same (city, year) order as `population`.
male_tuples = [
    5111259, 4732275,
    1773170, 1668618,
    1390356, 1476813,
    1255245, 1198815,
    753648, 734441,
]
female_tuples = [
    5201286, 4988571,
    1794740, 1735805,
    1367940, 1470404,
    1256431, 1229139,
    750016, 736599,
]
korea_mdf = pd.DataFrame(
    {
        "Total_Pop": population,
        "Male_Pop": male_tuples,
        "Female_pop": female_tuples,
    }
)
print(korea_mdf)
#                 Total_Pop  Male_Pop  Female_pop
#   Seoul   2010   10312545   5111259     5201286
#           2020    9720846   4732275     4988571
#   Busan   2010    2567910   1773170     1794740
#           2020    3404423   1668618     1735805
#   Incheon 2010    2758296   1390356     1367940
#           2020    2947217   1476813     1470404
#   Daegu   2010    2511676   1255245     1256431
#           2020    2427954   1198815     1229139
#   Daejun  2010    1503664    753648      750016
#           2020    1471040    734441      736599

# Males per 100 females, for every (city, year) row.
ratio = korea_mdf.Male_Pop * 100 / korea_mdf.Female_pop
print(ratio)
#   Seoul    2010     98.269140
#            2020     94.862336
#   Busan    2010     98.798155
#            2020     96.129346
#   Incheon  2010    101.638668
#            2020    100.435867
#   Daegu    2010     99.905606
#            2020     97.532907
#   Daejun   2010    100.484256
#            2020     99.707032
#   dtype: float64
print(ratio.unstack())
#                  2010        2020
#   Busan     98.798155   96.129346
#   Daegu     99.905606   97.532907
#   Daejun   100.484256   99.707032
#   Incheon  101.638668  100.435867
#   Seoul     98.269140   94.862336

# Rebuild the frame with the ratio as a fourth column.
korea_mdf = pd.DataFrame(
    {
        "Total_Pop": population,
        "Male_Pop": male_tuples,
        "Female_pop": female_tuples,
        "MFE_Ratio": ratio,
    }
)
print(korea_mdf)
#                 Total_Pop  Male_Pop  Female_pop   MFE_Ratio
#   Seoul   2010   10312545   5111259     5201286   98.269140
#           2020    9720846   4732275     4988571   94.862336
#   Busan   2010    2567910   1773170     1794740   98.798155
#           2020    3404423   1668618     1735805   96.129346
#   Incheon 2010    2758296   1390356     1367940  101.638668
#           2020    2947217   1476813     1470404  100.435867
#   Daegu   2010    2511676   1255245     1256431   99.905606
#           2020    2427954   1198815     1229139   97.532907
#   Daejun  2010    1503664    753648      750016  100.484256
#           2020    1471040    734441      736599   99.707032
###### Creating a MultiIndex ######
# Passing a list of arrays as `index` produces a two-level index implicitly.
df = pd.DataFrame(
    np.random.rand(6, 3),
    index=[
        ["a", "a", "b", "b", "c", "c"],
        [1, 2, 1, 2, 1, 2],
    ],
    columns=["c1", "c2", "c3"],
)
print(df)  # random values -- they differ on every run, e.g.
#              c1        c2        c3
#   a 1  0.014853  0.941698  0.271574
#     2  0.153064  0.844805  0.068737
#   b 1  0.308630  0.152921  0.684782
#     2  0.433434  0.862060  0.493691
#   c 1  0.091931  0.644179  0.773403
#     2  0.558342  0.721835  0.455065

# Four explicit constructors; each of the prints below shows the same index:
#   MultiIndex([('a', 1),
#               ('a', 2),
#               ('b', 1),
#               ('b', 2),
#               ('c', 1),
#               ('c', 2)],
#              )

# 1) from parallel arrays, one per level
print(
    pd.MultiIndex.from_arrays(
        [["a", "a", "b", "b", "c", "c"], [1, 2, 1, 2, 1, 2]]
    )
)

# 2) from a list of label tuples
print(
    pd.MultiIndex.from_tuples(
        [("a", 1), ("a", 2), ("b", 1), ("b", 2), ("c", 1), ("c", 2)]
    )
)

# 3) from the Cartesian product of the level values
print(pd.MultiIndex.from_product([["a", "b", "c"], [1, 2]]))

# 4) from the level values plus integer codes pointing into them
print(
    pd.MultiIndex(
        levels=[["a", "b", "c"], [1, 2]],
        codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
    )
)
# Give the two levels of the `population` index descriptive names.
# Before: the levels are unnamed.
print(population)
# Seoul 2010 10312545
# 2020 9720846
# Busan 2010 2567910
# 2020 3404423
# Incheon 2010 2758296
# 2020 2947217
# Daegu 2010 2511676
# 2020 2427954
# Daejun 2010 1503664
# 2020 1471040
# dtype: int64
# Assigning to `.index.names` changes the existing index object in place.
# NOTE(review): the later printout of `korea_mdf` also shows these names,
# presumably because it shares this index object -- confirm on your pandas version.
population.index.names = ['Adm.dstrct', 'Year']
# After: a header row with the level names appears.
print(population)
# Adm.dstrct Year
# Seoul 2010 10312545
# 2020 9720846
# Busan 2010 2567910
# 2020 3404423
# Incheon 2010 2758296
# 2020 2947217
# Daegu 2010 2511676
# 2020 2427954
# Daejun 2010 1503664
# 2020 1471040
# dtype: int64
# A frame whose rows AND columns are both multi-indexed, with named levels.
idx = pd.MultiIndex.from_product(
    [["a", "b", "c"], [1, 2]],
    names=["name1", "name2"],
)
cols = pd.MultiIndex.from_product(
    [["c1", "c2", "c3"], [1, 2]],
    names=["col_name1", "col_name2"],
)
data = np.random.randn(6, 6).round(2)  # random normal values, 2 decimals
mdf = pd.DataFrame(data, index=idx, columns=cols)
print(mdf)  # values differ on every run, e.g.
#   col_name1      c1          c2          c3
#   col_name2       1     2     1     2     1     2
#   name1 name2
#   a     1     -1.31 -0.05  1.45  1.55  1.26 -0.26
#         2     -1.57 -0.14 -1.19  1.57  0.04  0.75
#   b     1      0.19 -0.00 -0.57 -1.08 -0.48 -0.28
#         2     -0.81 -0.85 -0.06  0.57 -0.08 -0.69
#   c     1      1.60 -1.06 -0.57 -0.27 -0.85  0.69
#         2      2.40  0.15 -1.13  0.25 -0.17  0.48

### Indexing and slicing ###
print(mdf["c2"])  # an outer column label returns its sub-frame
# (sample from a different run, so the numbers do not match the table above)
#   col_name2       1     2
#   name1 name2
#   a     1      0.65  1.52
#         2      0.13  0.77
#   b     1     -0.08  0.21
#         2      0.73 -1.20
#   c     1      1.79 -0.02
#         2     -0.08 -0.63
# --- Selecting from the multi-indexed `population` Series ---
print(population["Incheon", 2010])  # 2758296 -- one label per level -> scalar
print(population[:, 2010])  # all districts, year 2010
#   Adm.dstrct
#   Seoul      10312545
#   Busan       2567910
#   Incheon     2758296
#   Daegu       2511676
#   Daejun      1503664
#   dtype: int64
print(population[population > 3000000])  # boolean mask
#   Adm.dstrct  Year
#   Seoul       2010    10312545
#               2020     9720846
#   Busan       2020     3404423
#   dtype: int64
print(population[["Daegu", "Daejun"]])  # list of outer-level labels
#   Adm.dstrct  Year
#   Daegu       2010    2511676
#               2020    2427954
#   Daejun      2010    1503664
#               2020    1471040
#   dtype: int64

# --- Selecting from the multi-indexed `mdf` frame ---
# (`mdf` holds random data, so every sample below comes from a different run.)
print(mdf["c2", 1])  # a full column key -> single column as a Series
#   name1  name2
#   a      1        0.92
#          2       -0.50
#   b      1        1.13
#          2        1.17
#   c      1       -0.32
#          2       -1.76
#   Name: (c2, 1), dtype: float64
print(mdf.iloc[:3, :4])  # first 3 rows, first 4 columns by position
#   col_name1      c1          c2
#   col_name2       1     2     1     2
#   name1 name2
#   a     1      0.18  1.39  0.86 -0.16
#         2     -0.42  0.82 -0.06 -0.03
#   b     1      0.42 -0.45 -0.09 -0.34
print(mdf.loc[:, ("c2", 1)])  # same column as mdf["c2", 1], via .loc
#   name1  name2
#   a      1        1.02
#          2       -1.92
#   b      1        1.64
#          2       -0.44
#   c      1        0.25
#          2       -0.57
#   Name: (c2, 1), dtype: float64

# pd.IndexSlice allows slice syntax inside a multi-level key:
# rows whose second level is 2, columns whose second level is 2.
idx_slice = pd.IndexSlice
print(mdf.loc[idx_slice[:, 2], idx_slice[:, 2]])
#   col_name1      c1    c2    c3
#   col_name2       2     2     2
#   name1 name2
#   a     2     -0.84  1.88 -2.44
#   b     2     -0.61 -0.78  0.12
#   c     2     -0.29  0.55  0.77
##### Re-arranging a MultiIndex #####
print(idx)
#   MultiIndex([('a', 1),
#               ('a', 2),
#               ('b', 1),
#               ('b', 2),
#               ('c', 1),
#               ('c', 2)],
#              names=['name1', 'name2'])

print(korea_mdf)  # rows are still in insertion order, i.e. not sorted
#                    Total_Pop  Male_Pop  Female_pop   MFE_Ratio
#   Adm.dstrct Year
#   Seoul      2010   10312545   5111259     5201286   98.269140
#              2020    9720846   4732275     4988571   94.862336
#   Busan      2010    2567910   1773170     1794740   98.798155
#              2020    3404423   1668618     1735805   96.129346
#   Incheon    2010    2758296   1390356     1367940  101.638668
#              2020    2947217   1476813     1470404  100.435867
#   Daegu      2010    2511676   1255245     1256431   99.905606
#              2020    2427954   1198815     1229139   97.532907
#   Daejun     2010    1503664    753648      750016  100.484256
#              2020    1471040    734441      736599   99.707032

# Sort the index first: label-range slicing on a MultiIndex only works once
# the index is sorted. sort_index() returns a new, sorted frame, so the result
# is assigned back (the original line repeated the assignment target twice).
korea_mdf = korea_mdf.sort_index()
print(korea_mdf)
#                    Total_Pop  Male_Pop  Female_pop   MFE_Ratio
#   Adm.dstrct Year
#   Busan      2010    2567910   1773170     1794740   98.798155
#              2020    3404423   1668618     1735805   96.129346
#   Daegu      2010    2511676   1255245     1256431   99.905606
#              2020    2427954   1198815     1229139   97.532907
#   Daejun     2010    1503664    753648      750016  100.484256
#              2020    1471040    734441      736599   99.707032
#   Incheon    2010    2758296   1390356     1367940  101.638668
#              2020    2947217   1476813     1470404  100.435867
#   Seoul      2010   10312545   5111259     5201286   98.269140
#              2020    9720846   4732275     4988571   94.862336

# Now sorted alphabetically, so a label range over the outer level works.
# .loc makes it explicit that rows are being selected by label.
print(korea_mdf.loc['Daegu':'Incheon'])
#                    Total_Pop  Male_Pop  Female_pop   MFE_Ratio
#   Adm.dstrct Year
#   Daegu      2010    2511676   1255245     1256431   99.905606
#              2020    2427954   1198815     1229139   97.532907
#   Daejun     2010    1503664    753648      750016  100.484256
#              2020    1471040    734441      736599   99.707032
#   Incheon    2010    2758296   1390356     1367940  101.638668
#              2020    2947217   1476813     1470404  100.435867
# unstack(level=0): move the OUTER index level (district) into the columns.
print(korea_mdf.unstack(level=0))
#              Total_Pop                    ...   MFE_Ratio
#   Adm.dstrct     Busan    Daegu   Daejun  ...      Daejun     Incheon      Seoul
#   Year                                    ...
#   2010         2567910  2511676  1503664  ...  100.484256  101.638668  98.269140
#   2020         3404423  2427954  1471040  ...   99.707032  100.435867  94.862336
#   [2 rows x 20 columns]

# unstack(level=1): move the INNER index level (year) into the columns.
print(korea_mdf.unstack(level=1))
#              Total_Pop           Male_Pop  ... Female_pop   MFE_Ratio
#   Year            2010     2020      2010  ...       2020        2010        2020
#   Adm.dstrct                               ...
#   Busan        2567910  3404423   1773170  ...    1735805   98.798155   96.129346
#   Daegu        2511676  2427954   1255245  ...    1229139   99.905606   97.532907
#   Daejun       1503664  1471040    753648  ...     736599  100.484256   99.707032
#   Incheon      2758296  2947217   1390356  ...    1470404  101.638668  100.435867
#   Seoul       10312545  9720846   5111259  ...    4988571   98.269140   94.862336
#   [5 rows x 8 columns]

# stack(): fold the column labels into a third, innermost index level.
# The result is one long float64 Series with 4 entries per (district, year).
print(korea_mdf.stack())
#   Adm.dstrct  Year
#   Busan       2010  Total_Pop     2.567910e+06
#                     Male_Pop      1.773170e+06
#                     Female_pop    1.794740e+06
#                     MFE_Ratio     9.879815e+01
#               2020  Total_Pop     3.404423e+06
#                     Male_Pop      1.668618e+06
#                     Female_pop    1.735805e+06
#                     MFE_Ratio     9.612935e+01
#   Daegu       2010  Total_Pop     2.511676e+06
#                     ...
#   Seoul       2020  Total_Pop     9.720846e+06
#                     Male_Pop      4.732275e+06
#                     Female_pop    4.988571e+06
#                     MFE_Ratio     9.486234e+01
#   dtype: float64

print(korea_mdf)  # the frame itself still has its usual shape
#                    Total_Pop  Male_Pop  Female_pop   MFE_Ratio
#   Adm.dstrct Year
#   Busan      2010    2567910   1773170     1794740   98.798155
#              2020    3404423   1668618     1735805   96.129346
#   Daegu      2010    2511676   1255245     1256431   99.905606
#              2020    2427954   1198815     1229139   97.532907
#   Daejun     2010    1503664    753648      750016  100.484256
#              2020    1471040    734441      736599   99.707032
#   Incheon    2010    2758296   1390356     1367940  101.638668
#              2020    2947217   1476813     1470404  100.435867
#   Seoul      2010   10312545   5111259     5201286   98.269140
#              2020    9720846   4732275     4988571   94.862336

# reset_index(level=0): turn only the outer level into an ordinary column.
idx_flat = korea_mdf.reset_index(level=0)
print(idx_flat)
#        Adm.dstrct  Total_Pop  Male_Pop  Female_pop   MFE_Ratio
#   Year
#   2010      Busan    2567910   1773170     1794740   98.798155
#   2020      Busan    3404423   1668618     1735805   96.129346
#   2010      Daegu    2511676   1255245     1256431   99.905606
#   2020      Daegu    2427954   1198815     1229139   97.532907
#   2010     Daejun    1503664    753648      750016  100.484256
#   2020     Daejun    1471040    734441      736599   99.707032
#   2010    Incheon    2758296   1390356     1367940  101.638668
#   2020    Incheon    2947217   1476813     1470404  100.435867
#   2010      Seoul   10312545   5111259     5201286   98.269140
#   2020      Seoul    9720846   4732275     4988571   94.862336

# reset_index over both levels: a flat frame with a default 0..9 index.
idx_flat = korea_mdf.reset_index(level=[0, 1])
print(idx_flat)
#     Adm.dstrct  Year  Total_Pop  Male_Pop  Female_pop   MFE_Ratio
#   0      Busan  2010    2567910   1773170     1794740   98.798155
#   1      Busan  2020    3404423   1668618     1735805   96.129346
#   2      Daegu  2010    2511676   1255245     1256431   99.905606
#   3      Daegu  2020    2427954   1198815     1229139   97.532907
#   4     Daejun  2010    1503664    753648      750016  100.484256
#   5     Daejun  2020    1471040    734441      736599   99.707032
#   6    Incheon  2010    2758296   1390356     1367940  101.638668
#   7    Incheon  2020    2947217   1476813     1470404  100.435867
#   8      Seoul  2010   10312545   5111259     5201286   98.269140
#   9      Seoul  2020    9720846   4732275     4988571   94.862336

# set_index is the inverse: promote the two columns back to a MultiIndex.
print(idx_flat.set_index(["Adm.dstrct", "Year"]))
#                    Total_Pop  Male_Pop  Female_pop   MFE_Ratio
#   Adm.dstrct Year
#   Busan      2010    2567910   1773170     1794740   98.798155
#              2020    3404423   1668618     1735805   96.129346
#   Daegu      2010    2511676   1255245     1256431   99.905606
#              2020    2427954   1198815     1229139   97.532907
#   Daejun     2010    1503664    753648      750016  100.484256
#              2020    1471040    734441      736599   99.707032
#   Incheon    2010    2758296   1390356     1367940  101.638668
#              2020    2947217   1476813     1470404  100.435867
#   Seoul      2010   10312545   5111259     5201286   98.269140
#              2020    9720846   4732275     4988571   94.862336
# --- Blog footer (scraping residue, not part of the script) ---
# (Blog layout marker "반응형" / "responsive".)
# Other posts in the 'Deep Learning' category:
#   YOLO (0) | 2022.09.14
#   Library_Pandas_2 (0) | 2022.09.14
#   Simply Perceptron Implement for TF (CSV) (0) | 2022.09.13
#   Implementing a Simple Neural Network Structure Using TensorFlow (0) | 2022.09.13
#   TensorFlow (0) | 2022.09.12