Library_Pandas_1

2022. 9. 14. 10:42

import numpy as np  # Pandas is based on the Numpy
import pandas as pd

""" Pandas Objects """

#####----- Series Objects -----#####

# A Series pairs a 1-D array of values with an index.  Built from a plain
# list it receives the default positional index 0..n-1.
_quarters = [0, 0.25, 0.5, 0.75, 1.0]
s = pd.Series(_quarters)
# 0    0.00
# 1    0.25
# 2    0.50
# 3    0.75
# 4    1.00
# dtype: float64

print(s.index)      # RangeIndex(start=0, stop=5, step=1) -> the index object

print(s.values)     # [0.   0.25 0.5  0.75 1.  ] -> the underlying value array

print(s.iloc[1:4])  # positional slice: rows at positions 1, 2 and 3

# Same values, but with explicit string labels instead of the default index.
s = pd.Series(_quarters, index=list('abcde'))
# a    0.00
# b    0.25
# c    0.50
# d    0.75
# e    1.00
# dtype: float64

print(s.loc['c'])   # 0.5 -> value stored under label 'c'

print(s.loc[['c', 'd', 'e']])   # a list of labels selects just those rows
# c    0.50
# d    0.75
# e    1.00
# dtype: float64

print(s.loc['b':])  # label slice from 'b' to the end
# b    0.25
# c    0.50
# d    0.75
# e    1.00
# dtype: float64

print('b' in s.index)   # True -> membership is tested against the labels

print(s.unique())       # [0.   0.25 0.5  0.75 1.  ]

print(s.value_counts())     # how many times each value occurs
# 0.00    1
# 0.25    1
# 0.50    1
# 0.75    1
# 1.00    1
# dtype: int64

print(s.isin([0.25, 0.75]))     # per-row flag: is the value one of the listed ones?
# a    False
# b     True
# c    False
# d     True
# e    False
# dtype: bool

# City -> population mapping used as the raw input for a Series.
pop_dic = dict(
    Seoul=9720846,
    Busan=343434,
    Incheon=35233,
    Daegu=45355,
    Daejun=553535,
)

# A dict converts directly: keys become the index labels, values the data.
population = pd.Series(data=pop_dic)
print(population)
# Seoul      9720846
# Busan       343434
# Incheon      35233
# Daegu        45355
# Daejun      553535
# dtype: int64

print(population.loc['Seoul'])  # single label -> scalar value

print(population.loc['Seoul':'Incheon'])    # label slice; the recorded output below includes both endpoints
# Seoul      9720846
# Busan       343434
# Incheon      35233
# dtype: int64

#####----- DataFrame Objects -----#####

# A list of dicts becomes one row per dict; keys missing from a row are
# filled with NaN (see columns D and C in the recorded output).
_records = [{'A': 2, 'B': 4, 'D': 3}, {'A': 4, 'B': 5, 'C': 7}]
a = pd.DataFrame(_records)
print(a)
#    A  B    D    C
# 0  2  4  3.0  NaN
# 1  4  5  NaN  7.0

# A 5x5 frame of uniform random numbers with explicit row and column labels.
b = pd.DataFrame(
    data=np.random.rand(5, 5),
    index=list(range(1, 6)),
    columns=list('ABCDE'),
)
print(b)
#           A         B         C         D         E
# 1  0.226003  0.180817  0.593278  0.352600  0.545705
# 2  0.689239  0.937041  0.064205  0.449810  0.437920
# 3  0.884453  0.442023  0.607337  0.558680  0.510686
# 4  0.896234  0.068875  0.803716  0.419932  0.597673
# 5  0.883514  0.738278  0.489593  0.925169  0.588909

# Combine several Series into one DataFrame (population example)

# Three per-city Series (total, male, female) sharing the same city labels.
_cities = ['Seoul', 'Busan', 'Incheon', 'Daegu', 'Daejun']

pop_dic = dict(zip(_cities, [9720846, 343434, 35233, 45355, 553535]))
population = pd.Series(pop_dic)

male_dic = dict(zip(_cities, [12134, 3434, 2424, 4355, 3535]))      # male population
male = pd.Series(male_dic)

female_dic = dict(zip(_cities, [3412, 5451, 2312, 2231, 7744]))     # female population
female = pd.Series(female_dic)

# Each Series becomes one column; rows are matched on the city label.
_columns = {'Population': population, 'Male': male, 'Female': female}
korea_df = pd.DataFrame(_columns)
print(korea_df)

#          Population   Male  Female
# Seoul       9720846  12134    3412
# Busan        343434   3434    5451
# Incheon       35233   2424    2312
# Daegu         45355   4355    2231
# Daejun       553535   3535    7744

print(korea_df.index)   # Index(['Seoul', 'Busan', 'Incheon', 'Daegu', 'Daejun'], dtype='object')

print(korea_df.columns) # Index(['Population', 'Male', 'Female'], dtype='object')

print(korea_df['Female'])   # a single column comes back as a Series
# Seoul      3412
# Busan      5451
# Incheon    2312
# Daegu      2231
# Daejun     7744
# Name: Female, dtype: int64

print(korea_df.loc['Seoul':'Incheon'])  # row slice by label
#          Population   Male  Female
# Seoul       9720846  12134    3412
# Busan        343434   3434    5451
# Incheon       35233   2424    2312

#####----- Index Objects -----#####

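# Index         : the generic Index object; axis labels held in a NumPy-array-like form
# Int64Index    : Index specialised for integers (pandas < 2.0 only; from 2.0 on this is a plain Index with dtype int64)
# MultiIndex    : hierarchical Index with several levels on a single axis (similar to an array of tuples)
# DatetimeIndex : stores timestamps as NumPy datetime64 values
# PeriodIndex   : Index for period (time-span) data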

# An Index can be indexed and sliced like a 1-D array.
idx = pd.Index(list(range(2, 11, 2)))
print(idx)                  # Int64Index([2, 4, 6, 8, 10], dtype='int64')
print(idx[1])               # 4 -> single position
print(idx[slice(1, 2, 2)])  # Int64Index([4], dtype='int64') -> start 1, stop 2, step 2
print(idx[-1:])             # Int64Index([10], dtype='int64') -> last element onward
print(idx[::2])             # Int64Index([2, 6, 10], dtype='int64') -> every second element

print(idx)                  # Int64Index([2, 4, 6, 8, 10], dtype='int64')
# Array-style attributes, printed in order: 5, (5,), 1, int64
for _attr in ('size', 'shape', 'ndim', 'dtype'):
    print(getattr(idx, _attr))

##### Index Operations #####


# Index operations (an Index is immutable, so each one returns a NEW Index):
# append               : concatenate another Index onto the end of this one
# difference           : labels in this Index that are not in the other
# intersection         : labels present in both
# union                : labels present in either
# symmetric_difference : labels present in exactly one of the two
# isin                 : boolean array telling whether each label is in the given values
# delete               : new Index with the entry at a given POSITION removed
# drop                 : new Index with the given LABEL(s) removed
# insert               : new Index with a label inserted at a given position
# is_monotonic_increasing / is_monotonic_decreasing : True if the labels are ordered
#                        (the plain `is_monotonic` alias was removed in pandas 2.0)
# is_unique            : True if no label is repeated
# unique               : the distinct labels, duplicates removed
#
# NOTE: older tutorials spell intersection/union/symmetric_difference as
# `idx1 & idx2`, `idx1 | idx2`, `idx1 ^ idx2`.  That operator form was
# deprecated, and since pandas 2.0 the operators are element-wise logical
# operations, so they no longer give the set results.  Use the explicit methods.

idx1 = pd.Index([1, 2, 4, 6, 8])
idx2 = pd.Index([2, 4, 5, 6, 7])

print(idx1.append(idx2))                # Int64Index([1, 2, 4, 6, 8, 2, 4, 5, 6, 7], dtype='int64')
print(idx1.difference(idx2))            # Int64Index([1, 8], dtype='int64')
print(idx1.intersection(idx2))          # Int64Index([2, 4, 6], dtype='int64')
print(idx1.intersection(idx2))          # same result; formerly written `idx1 & idx2`
print(idx1.union(idx2))                 # Int64Index([1, 2, 4, 5, 6, 7, 8], dtype='int64')
print(idx1.union(idx2))                 # same result; formerly written `idx1 | idx2`
print(idx1.delete(0))                   # Int64Index([2, 4, 6, 8], dtype='int64') -> removes position 0
print(idx1.drop(1))                     # Int64Index([2, 4, 6, 8], dtype='int64') -> removes label 1
print(idx1.symmetric_difference(idx2))  # Int64Index([1, 5, 7, 8], dtype='int64')
print(idx1.symmetric_difference(idx2))  # same result; formerly written `idx1 ^ idx2`

#####----- Details of Indexing -----#####

# A labelled Series behaves much like a dictionary: labels are the keys.
s = pd.Series([0, 0.25, 0.5, 0.75, 1.0], index=list('abcde'))
print(s)
# a    0.00
# b    0.25
# c    0.50
# d    0.75
# e    1.00
# dtype: float64

print(s.keys()) # Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

print(s.items())        # <zip object at 0x000001BF92066680> -> a lazy iterator of pairs
print(list(s.items()))  # [('a', 0.0), ('b', 0.25), ('c', 0.5), ('d', 0.75), ('e', 1.0)]

# Assigning to a label that does not exist yet appends a new entry.
s.loc['f'] = 1.25
print(s)
# a    0.00
# b    0.25
# c    0.50
# d    0.75
# e    1.00
# f    1.25
# dtype: float64    # like a dictionary, it consists of keys and values

print(s.loc['a':'d'])   # slice by label: 'd' is included
# a    0.00
# b    0.25
# c    0.50
# d    0.75
# dtype: float64

print(s.iloc[0:4])      # slice by position: stop is excluded, same four rows
# a    0.00
# b    0.25
# c    0.50
# d    0.75
# dtype: float64

print(s[s.gt(0.4) & s.lt(0.8)]) # boolean mask: keep values strictly between 0.4 and 0.8
# c    0.50
# d    0.75
# dtype: float64

print(s.loc[['a', 'c', 'e']])   # pick specific labels
# a    0.0
# c    0.5
# e    1.0
# dtype: float64


##### Series Indexing #####

# Integer LABELS (1, 3, 5, 7, 9) that differ from the integer POSITIONS
# (0..4) make the label/position distinction visible.
s = pd.Series(list('abcde'), index=list(range(1, 10, 2)))
print(s)
# 1    a
# 3    b
# 5    c
# 7    d
# 9    e
# dtype: object

print(s[2:4])   # a slice in [] counts positions here -> rows 2 and 3
# 5    c
# 7    d
# dtype: object

print(s[1])         # a -> a single integer in [] is looked up as a LABEL
print(s.iloc[1])    # b -> .iloc always counts positions

print(s.iloc[2:4])  # explicit positional slice, same rows as s[2:4]
# 5    c
# 7    d
# dtype: object

_full_range = range(10)
print(s.reindex(_full_range))   # new index 0..9; labels with no data become NaN
# 0    NaN
# 1      a
# 2    NaN
# 3      b
# 4    NaN
# 5      c
# 6    NaN
# 7      d
# 8    NaN
# 9      e
# dtype: object

print(s.reindex(_full_range, method='bfill'))   # same, but gaps take the NEXT available value
# 0    a
# 1    a
# 2    b
# 3    b
# 4    c
# 5    c
# 6    d
# 7    d
# 8    e
# 9    e
# dtype: object

##### DataFrame Indexing #####

# df[val]                   : select one column or several columns
# df.loc[val]               : select a subset of rows by label
# df.loc[:, val]            : select a subset of columns by label
# df.loc[val1, val2]        : select rows and columns by label
# df.iloc[where]            : select a subset of rows by integer position
# df.iloc[:, where]         : select a subset of columns by integer position
# df.iloc[where_i, where_j] : select rows and columns by integer position
# df.at[label_i, label_j]   : select a single value by row and column label
# df.iat[i, j]              : select a single value by row and column integer position
# reindex                   : conform one or more axes to a new index
# get_value, set_value      : select a value by row and column name (removed from pandas; use df.at / df.iat instead)

#####-------------------------------------------------#####
# Rebuild the per-city sample frame used by the examples in this section.
pop_dic = dict(Seoul=9720846, Busan=343434, Incheon=35233,
               Daegu=45355, Daejun=553535)
population = pd.Series(pop_dic)

male_dic = dict(Seoul=12134, Busan=3434, Incheon=2424,
                Daegu=4355, Daejun=3535)    # male population
male = pd.Series(male_dic)

female_dic = dict(Seoul=3412, Busan=5451, Incheon=2312,
                  Daegu=2231, Daejun=7744)  # female population
female = pd.Series(female_dic)

korea_df = pd.DataFrame(dict(Population=population, Male=male, Female=female))
print(korea_df)
#          Population   Male  Female
# Seoul       9720846  12134    3412
# Busan        343434   3434    5451
# Incheon       35233   2424    2312
# Daegu         45355   4355    2231
# Daejun       553535   3535    7744
#####-------------------------------------------------#####

print(korea_df['Male'])     # column access by key
# Seoul      12134
# Busan       3434
# Incheon     2424
# Daegu       4355
# Daejun      3535
# Name: Male, dtype: int64

print(korea_df.Male)        # the same column through attribute access
# Seoul      12134
# Busan       3434
# Incheon     2424
# Daegu       4355
# Daejun      3535
# Name: Male, dtype: int64

# New derived column: number of males per 100 females.
korea_df['RatioMFE'] = korea_df['Male'].mul(100).div(korea_df['Female'])
print(korea_df['RatioMFE'])
# Seoul      355.627198
# Busan       62.997615
# Incheon    104.844291
# Daegu      195.203944
# Daejun      45.648244
# Name: RatioMFE, dtype: float64

print(korea_df.to_numpy())  # the values alone, as a 2-D array
# [[9.72084600e+06 1.21340000e+04 3.41200000e+03 3.55627198e+02]
#  [3.43434000e+05 3.43400000e+03 5.45100000e+03 6.29976151e+01]
#  [3.52330000e+04 2.42400000e+03 2.31200000e+03 1.04844291e+02]
#  [4.53550000e+04 4.35500000e+03 2.23100000e+03 1.95203944e+02]
#  [5.53535000e+05 3.53500000e+03 7.74400000e+03 4.56482438e+01]]

print(korea_df.transpose()) # swap rows and columns
#                    Seoul          Busan       Incheon         Daegu         Daejun
# Population  9.720846e+06  343434.000000  35233.000000  45355.000000  553535.000000
# Male        1.213400e+04    3434.000000   2424.000000   4355.000000    3535.000000
# Female      3.412000e+03    5451.000000   2312.000000   2231.000000    7744.000000
# RatioMFE    3.556272e+02      62.997615    104.844291    195.203944      45.648244

print(korea_df.to_numpy()[0])   # first row of the value array
# [9.72084600e+06 1.21340000e+04 3.41200000e+03 3.55627198e+02]

print(korea_df.loc[:'Incheon', :'Male'])    # label slices on both axes
#          Population   Male
# Seoul       9720846  12134
# Busan        343434   3434
# Incheon       35233   2424

print(korea_df[korea_df['Female'] > 3000])  # keep rows where the condition holds
#         Population   Male  Female    RatioMFE
# Seoul      9720846  12134    3412  355.627198
# Busan       343434   3434    5451   62.997615
# Daejun      553535   3535    7744   45.648244

print(korea_df[korea_df['Population'] < 100000])
#          Population  Male  Female    RatioMFE
# Incheon       35233  2424    2312  104.844291
# Daegu         45355  4355    2231  195.203944

print(korea_df[korea_df['RatioMFE'] > 100])
#          Population   Male  Female    RatioMFE
# Seoul       9720846  12134    3412  355.627198
# Incheon       35233   2424    2312  104.844291
# Daegu         45355   4355    2231  195.203944

# Two conditions combined with & (both must hold).
print(korea_df[(korea_df['Population'] > 25000) & (korea_df['RatioMFE'] > 100)])
#          Population   Male  Female    RatioMFE
# Seoul       9720846  12134    3412  355.627198
# Incheon       35233   2424    2312  104.844291
# Daegu         45355   4355    2231  195.203944

print(korea_df.iloc[0:3, 0:2])  # first three rows, first two columns, by position
#          Population   Male
# Seoul       9720846  12134
# Busan        343434   3434
# Incheon       35233   2424

##### Multi Indexing #####

# 1차원의 Series와 2차원의 DataFrame 객체를 넘어 3차원, 4차원 이상의 고차원 데이터 처리
# 단일 인덱스 내에 여러 인덱스를 포함하는 다중 인덱싱

### Multi Indexing Series

#####-------------------------------------------------#####
# Reset the single-index sample data (this rebuild drops the RatioMFE
# column that the previous section added to korea_df).
_cities = ['Seoul', 'Busan', 'Incheon', 'Daegu', 'Daejun']

pop_dic = dict(zip(_cities, [9720846, 343434, 35233, 45355, 553535]))
population = pd.Series(pop_dic)

male_dic = dict(zip(_cities, [12134, 3434, 2424, 4355, 3535]))      # male population
male = pd.Series(male_dic)

female_dic = dict(zip(_cities, [3412, 5451, 2312, 2231, 7744]))     # female population
female = pd.Series(female_dic)

_frame_columns = {'Population': population, 'Male': male, 'Female': female}
korea_df = pd.DataFrame(_frame_columns)

#          Population   Male  Female
# Seoul       9720846  12134    3412
# Busan        343434   3434    5451
# Incheon       35233   2424    2312
# Daegu         45355   4355    2231
# Daejun       553535   3535    7744
#####-------------------------------------------------#####

# Create a multi Index*
# Build a two-level key (city, year) for every city and both census years.
_cities = ('Seoul', 'Busan', 'Incheon', 'Daegu', 'Daejun')
idx_tuples = [(city, year) for city in _cities for year in (2010, 2020)]

print(idx_tuples)
# [('Seoul', 2010), ('Seoul', 2020), ('Busan', 2010), ('Busan', 2020),
#  ('Incheon', 2010), ('Incheon', 2020), ('Daegu', 2010), ('Daegu', 2020),
#  ('Daejun', 2010), ('Daejun', 2020)]

# One population figure per (city, year) key, in the same order as idx_tuples.
pop_tuples = [
    10312545, 9720846,
    2567910, 3404423,
    2758296, 2947217,
    2511676, 2427954,
    1503664, 1471040,
]

print(pop_tuples)
# [10312545, 9720846, 2567910, 3404423, 2758296, 2947217, 2511676,
#  2427954, 1503664, 1471040]

population = pd.Series(data=pop_tuples, index=idx_tuples)
print(population)
# (Seoul, 2010)      10312545
# (Seoul, 2020)       9720846
# (Busan, 2010)       2567910
# (Busan, 2020)       3404423
# (Incheon, 2010)     2758296
# (Incheon, 2020)     2947217
# (Daegu, 2010)       2511676
# (Daegu, 2020)       2427954
# (Daejun, 2010)      1503664
# (Daejun, 2020)      1471040
# dtype: int64

# Turn the list of tuples into a proper hierarchical index.
midx = pd.MultiIndex.from_tuples(idx_tuples)
print(midx)
# MultiIndex([(  'Seoul', 2010),
#             (  'Seoul', 2020),
#             (  'Busan', 2010),
#             (  'Busan', 2020),
#             ('Incheon', 2010),
#             ('Incheon', 2020),
#             (  'Daegu', 2010),
#             (  'Daegu', 2020),
#             ( 'Daejun', 2010),
#             ( 'Daejun', 2020)],
#            )

# Re-key the Series on the MultiIndex: the outer level is the city, the inner the year.
population = population.reindex(midx)
print(population)
# Seoul    2010    10312545
#          2020     9720846
# Busan    2010     2567910
#          2020     3404423
# Incheon  2010     2758296
#          2020     2947217
# Daegu    2010     2511676
#          2020     2427954
# Daejun   2010     1503664
#          2020     1471040
# dtype: int64

print(population[:, 2010])  # every city, year 2010 only
# Seoul      10312545
# Busan       2567910
# Incheon     2758296
# Daegu       2511676
# Daejun      1503664
# dtype: int64

print(population['Daejun', :])  # one city, every year
# 2010    1503664
# 2020    1471040
# dtype: int64

### Create Multi DataFrame Set

# unstack(): move the innermost index level (the year) into the columns,
# turning the two-level Series into a city x year DataFrame.
korea_mdf = population.unstack(level=-1)
#              2010     2020
# Busan     2567910  3404423
# Daegu     2511676  2427954
# Daejun    1503664  1471040
# Incheon   2758296  2947217
# Seoul    10312545  9720846

# stack(): the reverse move, columns back into an inner index level.
print(korea_mdf.stack())
# Busan    2010     2567910
#          2020     3404423
# Daegu    2010     2511676
#          2020     2427954
# Daejun   2010     1503664
#          2020     1471040
# Incheon  2010     2758296
#          2020     2947217
# Seoul    2010    10312545
#          2020     9720846
# dtype: int64

# Male population per (city, year), listed in the same row order as
# `population` (two entries per city: 2010 then 2020).
# Despite the name this is a plain list, not a tuple.
male_tuples = [5111259, 4732275,
              1773170, 1668618,
              1390356, 1476813,
              1255245, 1198815,
              753648, 734441]

# Female population per (city, year), same ordering as male_tuples.
female_tuples = [5201286, 4988571,
                  1794740, 1735805,
                  1367940, 1470404,
                  1256431, 1229139,
                  750016, 736599]

# Build a multi-column frame.  The only Series passed in is `population`;
# the two plain lists carry no labels of their own, and the recorded output
# shows them laid out row by row against population's (city, year) index.
korea_mdf = pd.DataFrame({'Total_Pop' : population,
                          'Male_Pop' : male_tuples,
                          'Female_pop' : female_tuples})

print(korea_mdf)
#               Total_Pop  Male_Pop  Female_pop
# Seoul   2010   10312545   5111259     5201286
#         2020    9720846   4732275     4988571
# Busan   2010    2567910   1773170     1794740
#         2020    3404423   1668618     1735805
# Incheon 2010    2758296   1390356     1367940
#         2020    2947217   1476813     1470404
# Daegu   2010    2511676   1255245     1256431
#         2020    2427954   1198815     1229139
# Daejun  2010    1503664    753648      750016
#         2020    1471040    734441      736599

# Males per 100 females, computed row by row.
ratio = korea_mdf['Male_Pop'] * 100 / korea_mdf['Female_pop']
print(ratio)
# Seoul    2010     98.269140
#          2020     94.862336
# Busan    2010     98.798155
#          2020     96.129346
# Incheon  2010    101.638668
#          2020    100.435867
# Daegu    2010     99.905606
#          2020     97.532907
# Daejun   2010    100.484256
#          2020     99.707032
# dtype: float64

# Year level moved into the columns: one row per city, one column per year.
print(ratio.unstack())
#                2010        2020
# Busan     98.798155   96.129346
# Daegu     99.905606   97.532907
# Daejun   100.484256   99.707032
# Incheon  101.638668  100.435867
# Seoul     98.269140   94.862336

# Rebuild the frame with the ratio as a fourth column.
# NOTE(review): later printouts of korea_mdf show the level names that are
# assigned to population.index further down, so the frame presumably shares
# population's index object — confirm before changing how it is constructed.
korea_mdf = pd.DataFrame({'Total_Pop' : population,
                          'Male_Pop' : male_tuples,
                          'Female_pop' : female_tuples,
                          'MFE_Ratio' : ratio})

print(korea_mdf)
#               Total_Pop  Male_Pop  Female_pop   MFE_Ratio
# Seoul   2010   10312545   5111259     5201286   98.269140
#         2020    9720846   4732275     4988571   94.862336
# Busan   2010    2567910   1773170     1794740   98.798155
#         2020    3404423   1668618     1735805   96.129346
# Incheon 2010    2758296   1390356     1367940  101.638668
#         2020    2947217   1476813     1470404  100.435867
# Daegu   2010    2511676   1255245     1256431   99.905606
#         2020    2427954   1198815     1229139   97.532907
# Daejun  2010    1503664    753648      750016  100.484256
#         2020    1471040    734441      736599   99.707032

###### CREATE MULTI INDEX ######

# Four equivalent ways to obtain the same two-level index
# (a,1) (a,2) (b,1) (b,2) (c,1) (c,2).
_outer = ['a', 'a', 'b', 'b', 'c', 'c']
_inner = [1, 2, 1, 2, 1, 2]

# Passing a list of parallel arrays as `index` builds the MultiIndex implicitly.
df = pd.DataFrame(
    np.random.rand(6, 3),
    index=[_outer, _inner],
    columns=['c1', 'c2', 'c3'],
)
print(df)
#            c1        c2        c3
# a 1  0.014853  0.941698  0.271574
#   2  0.153064  0.844805  0.068737
# b 1  0.308630  0.152921  0.684782
#   2  0.433434  0.862060  0.493691
# c 1  0.091931  0.644179  0.773403
#   2  0.558342  0.721835  0.455065

# 1) from parallel arrays, one array per level
print(pd.MultiIndex.from_arrays([_outer, _inner]))
# MultiIndex([('a', 1),
#             ('a', 2),
#             ('b', 1),
#             ('b', 2),
#             ('c', 1),
#             ('c', 2)],
#            )

# 2) from a list of tuples, one tuple per row
print(pd.MultiIndex.from_tuples(list(zip(_outer, _inner))))
# MultiIndex([('a', 1),
#             ('a', 2),
#             ('b', 1),
#             ('b', 2),
#             ('c', 1),
#             ('c', 2)],
#            )

# 3) from the Cartesian product of the distinct values of each level
print(pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2]]))
# MultiIndex([('a', 1),
#             ('a', 2),
#             ('b', 1),
#             ('b', 2),
#             ('c', 1),
#             ('c', 2)],
#            )

# 4) from the raw parts: `levels` holds the distinct values per level,
#    `codes` gives, for every row, the position of its value inside each level
print(pd.MultiIndex(
    levels=[['a', 'b', 'c'], [1, 2]],
    codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
))
# MultiIndex([('a', 1),
#             ('a', 2),
#             ('b', 1),
#             ('b', 2),
#             ('c', 1),
#             ('c', 2)],
#            )

# The Series as it stands: its two index levels have no names yet.
print(population)
# Seoul    2010    10312545
#          2020     9720846
# Busan    2010     2567910
#          2020     3404423
# Incheon  2010     2758296
#          2020     2947217
# Daegu    2010     2511676
#          2020     2427954
# Daejun   2010     1503664
#          2020     1471040
# dtype: int64

# Give the two levels names (outer = administrative district, inner = year).
# NOTE(review): this assigns on the existing index object.  Later printouts
# of korea_mdf show these same names, so korea_mdf presumably shares this
# index object — confirm before replacing this line with a call that returns
# a new index (e.g. set_names / rename_axis).
population.index.names = ['Adm.dstrct', 'Year']
print(population)
# Adm.dstrct  Year
# Seoul       2010    10312545
#             2020     9720846
# Busan       2010     2567910
#             2020     3404423
# Incheon     2010     2758296
#             2020     2947217
# Daegu       2010     2511676
#             2020     2427954
# Daejun      2010     1503664
#             2020     1471040
# dtype: int64

# A frame with a two-level index on BOTH axes, each level carrying a name.
_row_levels = [['a', 'b', 'c'], [1, 2]]
_col_levels = [['c1', 'c2', 'c3'], [1, 2]]
idx = pd.MultiIndex.from_product(_row_levels, names=['name1', 'name2'])
cols = pd.MultiIndex.from_product(_col_levels, names=['col_name1', 'col_name2'])

# 6x6 standard-normal samples rounded to two decimals; values differ on every run.
data = np.random.randn(6, 6).round(2)
mdf = pd.DataFrame(data=data, index=idx, columns=cols)
print(mdf)
# col_name1      c1          c2          c3      
# col_name2       1     2     1     2     1     2
# name1 name2
# a     1     -1.31 -0.05  1.45  1.55  1.26 -0.26
#       2     -1.57 -0.14 -1.19  1.57  0.04  0.75
# b     1      0.19 -0.00 -0.57 -1.08 -0.48 -0.28
#       2     -0.81 -0.85 -0.06  0.57 -0.08 -0.69
# c     1      1.60 -1.06 -0.57 -0.27 -0.85  0.69
#       2      2.40  0.15 -1.13  0.25 -0.17  0.48

### INDEXING AND SLICING ###

# NOTE: mdf holds random numbers and the recorded outputs in this section
# come from different runs, so the values shown do not match each other or
# the mdf printout above.  Only the shape and labels of each result matter.

# A top-level column key returns every sub-column beneath it.
print(mdf['c2'])
# col_name2       1     2
# name1 name2
# a     1      0.65  1.52
#       2      0.13  0.77
# b     1     -0.08  0.21
#       2      0.73 -1.20
# c     1      1.79 -0.02
#       2     -0.08 -0.63

# Both index levels given -> a single scalar.
print(population['Incheon', 2010])  # 2758296

# Every district, year 2010 only; the Year level disappears from the result.
print(population[:, 2010])
# Adm.dstrct
# Seoul      10312545
# Busan       2567910
# Incheon     2758296
# Daegu       2511676
# Daejun      1503664
# dtype: int64

# Boolean mask: keep entries above three million.
print(population[population > 3000000])
# Adm.dstrct  Year
# Seoul       2010    10312545
#             2020     9720846
# Busan       2020     3404423
# dtype: int64

# A list of outer-level labels keeps all inner entries of those districts.
print(population[['Daegu', 'Daejun']])
# Adm.dstrct  Year
# Daegu       2010    2511676
#             2020    2427954
# Daejun      2010    1503664
#             2020    1471040
# dtype: int64

# Both column levels given -> one column, returned as a Series named (c2, 1).
print(mdf['c2', 1])
# name1  name2
# a      1        0.92
#        2       -0.50
# b      1        1.13
#        2        1.17
# c      1       -0.32
#        2       -1.76
# Name: (c2, 1), dtype: float64

# Purely positional: first three rows, first four columns.
print(mdf.iloc[:3, :4])
# col_name1      c1          c2      
# col_name2       1     2     1     2
# name1 name2
# a     1      0.18  1.39  0.86 -0.16
#       2     -0.42  0.82 -0.06 -0.03
# b     1      0.42 -0.45 -0.09 -0.34

# Label-based: all rows, the single column identified by the tuple ('c2', 1).
print(mdf.loc[:, ('c2', 1)])
# name1  name2
# a      1        1.02
#        2       -1.92
# b      1        1.64
#        2       -0.44
# c      1        0.25
#        2       -0.57
# Name: (c2, 1), dtype: float64

# pd.IndexSlice allows slice syntax per level inside .loc:
# rows whose inner level is 2, columns whose inner level is 2.
idx_slice = pd.IndexSlice
print(mdf.loc[idx_slice[:,2], idx_slice[:, 2]])
# col_name1      c1    c2    c3
# col_name2       2     2     2
# name1 name2
# a     2     -0.84  1.88 -2.44
# b     2     -0.61 -0.78  0.12
# c     2     -0.29  0.55  0.77


##### REARRANGE MULTIPLE INDEXING #####

print(idx)
# MultiIndex([('a', 1),
#             ('a', 2),
#             ('b', 1),
#             ('b', 2),
#             ('c', 1),
#             ('c', 2)],
#            names=['name1', 'name2'])

# Name the index levels of korea_mdf explicitly.
# NOTE(review): the recorded output below shows these names on korea_mdf even
# though they were only ever assigned to population.index, so the original
# relied on both objects sharing one index.  Setting them here keeps the
# printouts and the later set_index(['Adm.dstrct', 'Year']) working even if
# the index is not shared.
korea_mdf.index.names = ['Adm.dstrct', 'Year']

print(korea_mdf)    # rows are still in insertion order, not sorted
#                  Total_Pop  Male_Pop  Female_pop   MFE_Ratio
# Adm.dstrct Year
# Seoul      2010   10312545   5111259     5201286   98.269140
#            2020    9720846   4732275     4988571   94.862336
# Busan      2010    2567910   1773170     1794740   98.798155
#            2020    3404423   1668618     1735805   96.129346
# Incheon    2010    2758296   1390356     1367940  101.638668
#            2020    2947217   1476813     1470404  100.435867
# Daegu      2010    2511676   1255245     1256431   99.905606
#            2020    2427954   1198815     1229139   97.532907
# Daejun     2010    1503664    753648      750016  100.484256
#            2020    1471040    734441      736599   99.707032

# Sort the rows by index so that label slicing works.
# (The original had a redundant chained assignment `korea_mdf = korea_mdf = ...`.)
korea_mdf = korea_mdf.sort_index()
print(korea_mdf)
#                  Total_Pop  Male_Pop  Female_pop   MFE_Ratio
# Adm.dstrct Year
# Busan      2010    2567910   1773170     1794740   98.798155
#            2020    3404423   1668618     1735805   96.129346
# Daegu      2010    2511676   1255245     1256431   99.905606
#            2020    2427954   1198815     1229139   97.532907
# Daejun     2010    1503664    753648      750016  100.484256
#            2020    1471040    734441      736599   99.707032
# Incheon    2010    2758296   1390356     1367940  101.638668
#            2020    2947217   1476813     1470404  100.435867
# Seoul      2010   10312545   5111259     5201286   98.269140
#            2020    9720846   4732275     4988571   94.862336

# With the index sorted alphabetically, a label slice over the outer level is valid.
# .loc makes it explicit that this slices ROWS by label.
print(korea_mdf.loc['Daegu':'Incheon'])
#                  Total_Pop  Male_Pop  Female_pop   MFE_Ratio
# Adm.dstrct Year
# Daegu      2010    2511676   1255245     1256431   99.905606
#            2020    2427954   1198815     1229139   97.532907
# Daejun     2010    1503664    753648      750016  100.484256
#            2020    1471040    734441      736599   99.707032
# Incheon    2010    2758296   1390356     1367940  101.638668
#            2020    2947217   1476813     1470404  100.435867

# Move index level 0 (the district) into the columns: one row per year,
# one column per (measure, district) pair.
_by_year = korea_mdf.unstack(0)
print(_by_year)
#            Total_Pop                    ...   MFE_Ratio
# Adm.dstrct     Busan    Daegu   Daejun  ...      Daejun     Incheon      Seoul
# Year                                    ...
# 2010         2567910  2511676  1503664  ...  100.484256  101.638668  98.269140
# 2020         3404423  2427954  1471040  ...   99.707032  100.435867  94.862336

# [2 rows x 20 columns]

# Move index level 1 (the year) into the columns: one row per district,
# one column per (measure, year) pair.
_by_district = korea_mdf.unstack(1)
print(_by_district)
#            Total_Pop          Male_Pop  ... Female_pop   MFE_Ratio
# Year            2010     2020     2010  ...       2020        2010        2020
# Adm.dstrct                              ...
# Busan        2567910  3404423  1773170  ...    1735805   98.798155   96.129346
# Daegu        2511676  2427954  1255245  ...    1229139   99.905606   97.532907
# Daejun       1503664  1471040   753648  ...     736599  100.484256   99.707032
# Incheon      2758296  2947217  1390356  ...    1470404  101.638668  100.435867
# Seoul       10312545  9720846  5111259  ...    4988571   98.269140   94.862336

# [5 rows x 8 columns]

# The opposite direction: the column labels become a third, innermost index
# level, giving one long Series.
_long_form = korea_mdf.stack()
print(_long_form)
# Adm.dstrct  Year
# Busan       2010  Total_Pop     2.567910e+06
#                   Male_Pop      1.773170e+06
#                   Female_pop    1.794740e+06
#                   MFE_Ratio     9.879815e+01
#             2020  Total_Pop     3.404423e+06
#                   Male_Pop      1.668618e+06
#                   Female_pop    1.735805e+06
#                   MFE_Ratio     9.612935e+01
# Daegu       2010  Total_Pop     2.511676e+06
#                   Male_Pop      1.255245e+06
#                   Female_pop    1.256431e+06
#                   MFE_Ratio     9.990561e+01
#             2020  Total_Pop     2.427954e+06
#                   Male_Pop      1.198815e+06
#                   Female_pop    1.229139e+06
#                   MFE_Ratio     9.753291e+01
# Daejun      2010  Total_Pop     1.503664e+06
#                   Male_Pop      7.536480e+05
#                   Female_pop    7.500160e+05
#                   MFE_Ratio     1.004843e+02
#             2020  Total_Pop     1.471040e+06
#                   Male_Pop      7.344410e+05
#                   Female_pop    7.365990e+05
#                   MFE_Ratio     9.970703e+01
# Incheon     2010  Total_Pop     2.758296e+06
#                   Male_Pop      1.390356e+06
#                   Female_pop    1.367940e+06
#                   MFE_Ratio     1.016387e+02
#             2020  Total_Pop     2.947217e+06
#                   Male_Pop      1.476813e+06
#                   Female_pop    1.470404e+06
#                   MFE_Ratio     1.004359e+02
# Seoul       2010  Total_Pop     1.031254e+07
#                   Male_Pop      5.111259e+06
#                   Female_pop    5.201286e+06
#                   MFE_Ratio     9.826914e+01
#             2020  Total_Pop     9.720846e+06
#                   Male_Pop      4.732275e+06
#                   Female_pop    4.988571e+06
#                   MFE_Ratio     9.486234e+01
# dtype: float64

print(korea_mdf)    # the usual shape: (district, year) as a two-level row index
#                  Total_Pop  Male_Pop  Female_pop   MFE_Ratio
# Adm.dstrct Year
# Busan      2010    2567910   1773170     1794740   98.798155
#            2020    3404423   1668618     1735805   96.129346
# Daegu      2010    2511676   1255245     1256431   99.905606
#            2020    2427954   1198815     1229139   97.532907
# Daejun     2010    1503664    753648      750016  100.484256
#            2020    1471040    734441      736599   99.707032
# Incheon    2010    2758296   1390356     1367940  101.638668
#            2020    2947217   1476813     1470404  100.435867
# Seoul      2010   10312545   5111259     5201286   98.269140
#            2020    9720846   4732275     4988571   94.862336

# Demote only the outer level: the district becomes an ordinary column and
# the year stays behind as the row index.
idx_flat = korea_mdf.reset_index(level=0)
print(idx_flat)
#      Adm.dstrct  Total_Pop  Male_Pop  Female_pop   MFE_Ratio
# Year
# 2010      Busan    2567910   1773170     1794740   98.798155
# 2020      Busan    3404423   1668618     1735805   96.129346
# 2010      Daegu    2511676   1255245     1256431   99.905606
# 2020      Daegu    2427954   1198815     1229139   97.532907
# 2010     Daejun    1503664    753648      750016  100.484256
# 2020     Daejun    1471040    734441      736599   99.707032
# 2010    Incheon    2758296   1390356     1367940  101.638668
# 2020    Incheon    2947217   1476813     1470404  100.435867
# 2010      Seoul   10312545   5111259     5201286   98.269140
# 2020      Seoul    9720846   4732275     4988571   94.862336

# Demote every level (both of them here): a fully flat frame with a fresh 0..n-1 index.
idx_flat = korea_mdf.reset_index()
print(idx_flat)
#   Adm.dstrct  Year  Total_Pop  Male_Pop  Female_pop   MFE_Ratio
# 0      Busan  2010    2567910   1773170     1794740   98.798155
# 1      Busan  2020    3404423   1668618     1735805   96.129346
# 2      Daegu  2010    2511676   1255245     1256431   99.905606
# 3      Daegu  2020    2427954   1198815     1229139   97.532907
# 4     Daejun  2010    1503664    753648      750016  100.484256
# 5     Daejun  2020    1471040    734441      736599   99.707032
# 6    Incheon  2010    2758296   1390356     1367940  101.638668
# 7    Incheon  2020    2947217   1476813     1470404  100.435867
# 8      Seoul  2010   10312545   5111259     5201286   98.269140
# 9      Seoul  2020    9720846   4732275     4988571   94.862336

# And back again: promote the two columns to a two-level row index.
print(idx_flat.set_index(keys=['Adm.dstrct', 'Year']))
#                  Total_Pop  Male_Pop  Female_pop   MFE_Ratio
# Adm.dstrct Year
# Busan      2010    2567910   1773170     1794740   98.798155
#            2020    3404423   1668618     1735805   96.129346
# Daegu      2010    2511676   1255245     1256431   99.905606
#            2020    2427954   1198815     1229139   97.532907
# Daejun     2010    1503664    753648      750016  100.484256
#            2020    1471040    734441      736599   99.707032
# Incheon    2010    2758296   1390356     1367940  101.638668
#            2020    2947217   1476813     1470404  100.435867
# Seoul      2010   10312545   5111259     5201286   98.269140
#            2020    9720846   4732275     4988571   94.862336

저작자표시 (새창열림)

'Deep Learning' 카테고리의 다른 글

YOLO (0)	2022.09.14
Library_Pandas_2 (0)	2022.09.14
Simply Perceptron Implement for TF (CSV) (0)	2022.09.13
Implementing a Simple Neural Network Structure Using TensorFlow (0)	2022.09.13
TensorFlow (0)	2022.09.12

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

-. --- - .

Library_Pandas_1

'Deep Learning' 카테고리의 다른 글

+ Recent posts

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역