본문 바로가기
통계프로그램 비교 시리즈/데이터 전처리 비교

[데이터 관리] 12. 변수를 관측치로 전치후 원상태로 복구

by 기서무나구물 2022. 1. 15.

포스팅 목차

    [데이터 관리] 12. 변수를 관측치로 전치후 원상태로 복구

    * Reshaping Variables to Observations and Back

     


    1. Proc SQL

     

     

     


    2. SAS Programming

    • 데이터 전치를 위한 SAS 프로그램.
    • 길이가 넓은 포맷을 긴 포맷으로 전치.;
    SAS Programming
    data BACK.mydata;
         infile 'c:\data\mydata.csv' delimiter = ','
                MISSOVER DSD lrecl=32767 firstobs=2 ;
         input id workshop gender $ q1 q2 q3 q4;
    run;

     

     

    • 배열을 이용한 데이터 전치;
    SAS Programming
    DATA mylong;
     SET BACK.mydata;
         ARRAY q{4} q1-q4;
         DO i=1 to 4;
            y=q{i};
            question=i;
            output;
         END;
         KEEP id workshop gender question y;
    RUN;
    
    PROC PRINT; RUN;

     

    Results
    OBS    id    workshop    gender    y    question
      1     1        1         f       1        1
      2     1        1         f       1        2
      3     1        1         f       5        3
      4     1        1         f       1        4
      5     2        2         f       2        1
      6     2        2         f       1        2
      7     2        2         f       4        3
      8     2        2         f       1        4
      9     3        1         f       2        1
     10     3        1         f       2        2
     11     3        1         f       4        3
     12     3        1         f       3        4
     13     4        2         f       3        1
     14     4        2         f       1        2
     15     4        2         f       .        3
     16     4        2         f       3        4
     17     5        1         m       4        1
     18     5        1         m       5        2
     19     5        1         m       2        3
     20     5        1         m       4        4
     21     6        2         m       5        1
     22     6        2         m       4        2
     23     6        2         m       5        3
     24     6        2         m       5        4
     25     7        1         m       5        1
     26     7        1         m       3        2
     27     7        1         m       4        3
     28     7        1         m       4        4
     29     8        2         m       4        1
     30     8        2         m       5        2
     31     8        2         m       5        3
     32     8        2         m       5        4

     

    • Transpose 프로시져를 사용한 데이터 전치

    Transpose 프로시져를 사용하여 데이터를 전치한다.

    SAS Programming
    proc transpose data=BACK.mydata
                     out =TRA_DAT(rename=(col1=y))
                     name=question;
         by id workshop gender;
         var q1-q4;
    run;
    
    PROC PRINT; RUN;

     

    Results
    OBS    id    workshop    gender    question    y
      1     1        1         f          q1       1
      2     1        1         f          q2       1
      3     1        1         f          q3       5
      4     1        1         f          q4       1
      5     2        2         f          q1       2
      6     2        2         f          q2       1
      7     2        2         f          q3       4
      8     2        2         f          q4       1
      9     3        1         f          q1       2
     10     3        1         f          q2       2
     11     3        1         f          q3       4
     12     3        1         f          q4       3
     13     4        2         f          q1       3
     14     4        2         f          q2       1
     15     4        2         f          q3       .
     16     4        2         f          q4       3
     17     5        1         m          q1       4
     18     5        1         m          q2       5
     19     5        1         m          q3       2
     20     5        1         m          q4       4
     21     6        2         m          q1       5
     22     6        2         m          q2       4
     23     6        2         m          q3       5
     24     6        2         m          q4       5
     25     7        1         m          q1       5
     26     7        1         m          q2       3
     27     7        1         m          q3       4
     28     7        1         m          q4       4
     29     8        2         m          q1       4
     30     8        2         m          q2       5
     31     8        2         m          q3       5
     32     8        2         m          q4       5

     

     

    • 긴 포맷을 넓은 포맷으로 원상 복귀;
    SAS Programming
    PROC SORT DATA=mylong;
         BY id question;
    RUN;
    
    
    DATA mywide;
     SET mylong;
         BY id;
         RETAIN q1-q4;
         ARRAY q{*} q1-q4;
    
         IF FIRST.id THEN do;
            i=1;
            DO j=1 to 4;
               q{i}=.;
            END;
         end;
    
         q{i}=y;
         IF LAST.id THEN OUTPUT;
         i+1;
         DROP question y i j;
    
    RUN;
    
    PROC PRINT; RUN;

     

    Results
    OBS    id    workshop    gender    q1    q2    q3    q4
     1      1        1         f        1     1     5     1
     2      2        2         f        2     1     4     1
     3      3        1         f        2     2     4     3
     4      4        2         f        3     1     .     3
     5      5        1         m        4     5     2     4
     6      6        2         m        5     4     5     5
     7      7        1         m        5     3     4     4
     8      8        2         m        4     5     5     5

     

     

    • 긴 포맷을 넓은 포맷으로 원상 복귀;
    SAS Programming
    proc transpose data=TRA_DAT
                     out =TRA_DAT1(drop=_NAME_);
         by id workshop gender;
         var y;
         id question;
    run;
    
    PROC PRINT; RUN;

     

    Results
    OBS    id    workshop    gender    q1    q2    q3    q4
     1      1        1         f        1     1     5     1
     2      2        2         f        2     1     4     1
     3      3        1         f        2     2     4     3
     4      4        2         f        3     1     .     3
     5      5        1         m        4     5     2     4
     6      6        2         m        5     4     5     5
     7      7        1         m        5     3     4     4
     8      8        2         m        4     5     5     5

     

     


    3. SPSS

    • 넓은 포맷을 긴 포맷으로 전치;
    SPSS Programming
    GET FILE='C:\mydata.sav'.
    VARSTOCASES  /MAKE Y FROM q1 q2 q3 q4
     /INDEX = Question(4)
     /KEEP =  id workshop gender
     /NULL = KEEP.
    SAVE OUTFILE='C:\data\mywide.sav'.
    EXECUTE.

     

     

    • 긴 포맷을 넓은 포맷으로 전치;
    SPSS Programming
    GET FILE='C:\data\mywide.sav'.
    CASESTOVARS
     /ID = id workshop gender
     /INDEX = Question
     /GROUPBY = VARIABLE.
    SAVE OUTFILE='C:\data\mylong.sav'.
    EXECUTE.

     

     


    4. R Programming (R-PROJECT)

     

    R Programming
    from rpy2.robjects import r
    %load_ext rpy2.ipython

     

    Results
    The rpy2.ipython extension is already loaded. To reload it, use:
      %reload_ext rpy2.ipython

     

     

    R Programming
    %%R
    
    library(tidyverse)
    library(psych)
    
    library(reshape)
    library(reshape2)
    
    
    mydata <- read_csv("C:/work/data/mydata.csv", 
      col_types = cols( id       = col_double(),
                        workshop = col_character(),
                        gender   = col_character(),
                        q1       = col_double(),
                        q2       = col_double(),
                        q3       = col_double(),
                        q4       = col_double()
      )
    )
    
    withmooc = mydata
    
    attach(withmooc) # mydata를 기본 데이터 세트로 지정.
    
    withmooc

     

    Results
    R[write to console]: The following objects are masked from withmooc (pos = 3):
    
        gender, id, q1, q2, q3, q4, workshop
    
    
    
    
    # A tibble: 8 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     1 1        f          1     1     5     1
    2     2 2        f          2     1     4     1
    3     3 1        f          2     2     4     3
    4     4 2        f          3     1    NA     3
    5     5 1        m          4     5     2     4
    6     6 2        m          5     4     5     5
    7     7 1        m          5     3     4     4
    8     8 2        m          4     5     5     5

     

     

    • 이 예제를 위하여 ID변수가 필요.
    • rownames를 이용하여 ID변수를 생성.
    R Programming
    %%R
    
    withmooc$subject <- as.numeric( rownames(withmooc) )
    
    withmooc

     

    Results
    # A tibble: 8 x 8
         id workshop gender    q1    q2    q3    q4 subject
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>   <dbl>
    1     1 1        f          1     1     5     1       1
    2     2 2        f          2     1     4     1       2
    3     3 1        f          2     2     4     3       3
    4     4 2        f          3     1    NA     3       4
    5     5 1        m          4     5     2     4       5
    6     6 2        m          5     4     5     5       6
    7     7 1        m          5     3     4     4       7
    8     8 2        m          4     5     5     5       8

     

     

    • 또는 아래와 같이 생성을 할수 있다.
    R Programming
    %%R
    
    withmooc$subject <- 1:8
    
    print(withmooc)

     

    Results
    # A tibble: 8 x 8
         id workshop gender    q1    q2    q3    q4 subject
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>   <int>
    1     1 1        f          1     1     5     1       1
    2     2 2        f          2     1     4     1       2
    3     3 1        f          2     2     4     3       3
    4     4 2        f          3     1    NA     3       4
    5     5 1        m          4     5     2     4       5
    6     6 2        m          5     4     5     5       6
    7     7 1        m          5     3     4     4       7
    8     8 2        m          4     5     5     5       8

     

     

    • wide to long : 데이터 전치를 위하여 Reshape패키지의 melt함수 사용.
    R Programming
    %%R
    
    library(reshape2)
    mylong<-reshape2::melt(data = withmooc,id.vars=c("subject","workshop","gender"),measure.vars = c("q1", "q2","q3","q4"))
    
    head(mylong,10)

     

    Results
       subject workshop gender variable value
    1        1        1      f       q1     1
    2        2        2      f       q1     2
    3        3        1      f       q1     2
    4        4        2      f       q1     3
    5        5        1      m       q1     4
    6        6        2      m       q1     5
    7        7        1      m       q1     5
    8        8        2      m       q1     4
    9        1        1      f       q2     1
    10       2        2      f       q2     1

     

     

    • long to wide : Cast함수를 사용하여 세로 데이터를 가로 데이터로 전치.
    • Value 변수 값이 변수들에 대한 측정값이 된다.
    R Programming
    %%R
    
    mywide<-reshape::cast(mylong, subject+workshop+gender~variable)
    
    print(mywide)

     

    Results
      subject workshop gender q1 q2 q3 q4
    1       1        1      f  1  1  5  1
    2       2        2      f  2  1  4  1
    3       3        1      f  2  2  4  3
    4       4        2      f  3  1 NA  3
    5       5        1      m  4  5  2  4
    6       6        2      m  5  4  5  5
    7       7        1      m  5  3  4  4
    8       8        2      m  4  5  5  5

     

     


    5. R - Tidyverse

     

    R Programming
    from rpy2.robjects import r
    %load_ext rpy2.ipython

     

    Results
    The rpy2.ipython extension is already loaded. To reload it, use:
      %reload_ext rpy2.ipython

     

     

    R Programming
    %%R
    
    library(tidyverse)
    library(psych)
    mydata <- read_csv("C:/work/data/mydata.csv", 
      col_types = cols( id       = col_double(),
                        workshop = col_character(),
                        gender   = col_character(),
                        q1       = col_double(),
                        q2       = col_double(),
                        q3       = col_double(),
                        q4       = col_double()
      )
    )
    
    withmooc = mydata
    
    attach(withmooc) # mydata를 기본 데이터 세트로 지정.
    
    withmooc

     

    Results
    R[write to console]: The following objects are masked from withmooc (pos = 3):
    
    R[write to console]: The following objects are masked from withmooc (pos = 5):
    
        gender, id, q1, q2, q3, q4, workshop
    
    
    
    
    # A tibble: 8 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     1 1        f          1     1     5     1
    2     2 2        f          2     1     4     1
    3     3 1        f          2     2     4     3
    4     4 2        f          3     1    NA     3
    5     5 1        m          4     5     2     4
    6     6 2        m          5     4     5     5
    7     7 1        m          5     3     4     4
    8     8 2        m          4     5     5     5

     

     

    R Programming
    %%R
    
    withmooc <- withmooc %>%
      mutate(subject = rownames(mydata))
    
    withmooc

     

    Results
    # A tibble: 8 x 8
         id workshop gender    q1    q2    q3    q4 subject
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl> <chr>  
    1     1 1        f          1     1     5     1 1      
    2     2 2        f          2     1     4     1 2      
    3     3 1        f          2     2     4     3 3      
    4     4 2        f          3     1    NA     3 4      
    5     5 1        m          4     5     2     4 5      
    6     6 2        m          5     4     5     5 6      
    7     7 1        m          5     3     4     4 7      
    8     8 2        m          4     5     5     5 8      

     

     

    • wide to long : 데이터 전치를 위하여 gather함수 사용.
    R Programming
    %%R
    
    mylong <- withmooc %>% 
      dplyr::select(-id) %>%
      gather(key='variable', value='value', -subject,-workshop,-gender) %>%
      relocate(subject,workshop) 
    
    mylong

     

    Results
    # A tibble: 32 x 5
       subject workshop gender variable value
       <chr>   <chr>    <chr>  <chr>    <dbl>
     1 1       1        f      q1           1
     2 2       2        f      q1           2
     3 3       1        f      q1           2
     4 4       2        f      q1           3
     5 5       1        m      q1           4
     6 6       2        m      q1           5
     7 7       1        m      q1           5
     8 8       2        m      q1           4
     9 1       1        f      q2           1
    10 2       2        f      q2           1
    # ... with 22 more rows

     

     

    • wide to long : gather(), spread() 함수가 개선된 pivot_longer(), pivot_wider() 함수는 다른 패키지의 최신 기능이 추가됨.
    R Programming
    %%R
    
    mylong <- withmooc %>% 
      dplyr::select(-id) %>%
      pivot_longer( c(q1,q2,q3,q4), names_to='variable', values_to='value') %>%
      relocate(subject,workshop) 
    
    mylong

     

    Results
    # A tibble: 32 x 5
       subject workshop gender variable value
       <chr>   <chr>    <chr>  <chr>    <dbl>
     1 1       1        f      q1           1
     2 1       1        f      q2           1
     3 1       1        f      q3           5
     4 1       1        f      q4           1
     5 2       2        f      q1           2
     6 2       2        f      q2           1
     7 2       2        f      q3           4
     8 2       2        f      q4           1
     9 3       1        f      q1           2
    10 3       1        f      q2           2
    # ... with 22 more rows

     

     

    R Programming
    %%R
    
    mylong <- withmooc %>% 
      dplyr::select(-id) %>%
      pivot_longer( -c(subject,workshop,gender), names_to='variable', values_to='value') %>%
      relocate(subject,workshop) 
    
    mylong

     

    Results
    # A tibble: 32 x 5
       subject workshop gender variable value
       <chr>   <chr>    <chr>  <chr>    <dbl>
     1 1       1        f      q1           1
     2 1       1        f      q2           1
     3 1       1        f      q3           5
     4 1       1        f      q4           1
     5 2       2        f      q1           2
     6 2       2        f      q2           1
     7 2       2        f      q3           4
     8 2       2        f      q4           1
     9 3       1        f      q1           2
    10 3       1        f      q2           2
    # ... with 22 more rows

     

     

    • long to wide : spread / pivot_wider함수를 사용하여 세로 데이터를 가로 데이터로 전치.
    • Value 변수 값이 변수들에 대한 측정값이 된다.
    R Programming
    %%R
    
    mywide<- mylong %>% 
      spread(key='variable', value='value')
    
    mywide

     

    Results
    # A tibble: 8 x 7
      subject workshop gender    q1    q2    q3    q4
      <chr>   <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1 1       1        f          1     1     5     1
    2 2       2        f          2     1     4     1
    3 3       1        f          2     2     4     3
    4 4       2        f          3     1    NA     3
    5 5       1        m          4     5     2     4
    6 6       2        m          5     4     5     5
    7 7       1        m          5     3     4     4
    8 8       2        m          4     5     5     5

     

     

    R Programming
    %%R
    
    mywide <- mylong %>%
      pivot_wider(names_from='variable', values_from='value')
    
    mywide

     

    Results
    # A tibble: 8 x 7
      subject workshop gender    q1    q2    q3    q4
      <chr>   <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1 1       1        f          1     1     5     1
    2 2       2        f          2     1     4     1
    3 3       1        f          2     2     4     3
    4 4       2        f          3     1    NA     3
    5 5       1        m          4     5     2     4
    6 6       2        m          5     4     5     5
    7 7       1        m          5     3     4     4
    8 8       2        m          4     5     5     5

     

     


    6. Python - Pandas

     

    Python Programming
    import pandas as pd
    import numpy as np
    import sweetviz as sv
    
    mydata = pd.read_csv("C:/work/data/mydata.csv",sep=",",
                         dtype={'id':object,'workshop':object,
                                'q1':int, 'q2':int, 'q3':float, 'q4':int},
                         na_values=['NaN'],skipinitialspace =True)
    
    withmooc= mydata.copy()
    
    withmooc

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1	1	5.0	1
    1	2	2		f	2	1	4.0	1
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	1	NaN	3
    4	5	1		m	4	5	2.0	4
    5	6	2		m	5	4	5.0	5
    6	7	1		m	5	3	4.0	4
    7	8	2		m	4	5	5.0	5

     

     

    • 이 예제를 위하여 ID변수가 필요.
    • rownames를 이용하여 ID변수를 생성.
    Python Programming
    withmooc['subject'] = withmooc.index
    
    withmooc

     

    Results
    	id	workshop	gender	q1	q2	q3	q4	subject
    0	1	1		f	1	1	5.0	1	0
    1	2	2		f	2	1	4.0	1	1
    2	3	1		f	2	2	4.0	3	2
    3	4	2		f	3	1	NaN	3	3
    4	5	1		m	4	5	2.0	4	4
    5	6	2		m	5	4	5.0	5	5
    6	7	1		m	5	3	4.0	4	6
    7	8	2		m	4	5	5.0	5	7

     

     

    Python Programming
     
    print(withmooc.shape)
    print(withmooc.dtypes)
    withmooc.index

     

    Results
    (8, 8)
    id           object
    workshop     object
    gender       object
    q1            int32
    q2            int32
    q3          float64
    q4            int32
    subject       int64
    dtype: object
    
    
    
    
    
    RangeIndex(start=0, stop=8, step=1)

     

     

    • 또는 아래와 같이 생성을 할수 있다.
    Python Programming
    withmooc= mydata.copy()
    withmooc
    
    withmooc['subject'] = range(0,8)
    
    withmooc

     

    Results
    	id	workshop	gender	q1	q2	q3	q4	subject
    0	1	1		f	1	1	5.0	1	0
    1	2	2		f	2	1	4.0	1	1
    2	3	1		f	2	2	4.0	3	2
    3	4	2		f	3	1	NaN	3	3
    4	5	1		m	4	5	2.0	4	4
    5	6	2		m	5	4	5.0	5	5
    6	7	1		m	5	3	4.0	4	6
    7	8	2		m	4	5	5.0	5	7

     


    wide to long : melt

    • 데이터 전치를 위하여 melt함수 사용.
    Python Programming
    pd.melt(withmooc.iloc[:,1:], id_vars=["subject","workshop","gender"]).head(10)

     

    Results
    	subject	workshop	gender	variable	value
    0	0	1		f	q1		1.0
    1	1	2		f	q1		2.0
    2	2	1		f	q1		2.0
    3	3	2		f	q1		3.0
    4	4	1		m	q1		4.0
    5	5	2		m	q1		5.0
    6	6	1		m	q1		5.0
    7	7	2		m	q1		4.0
    8	0	1		f	q2		1.0
    9	1	2		f	q2		1.0

     

     

    Python Programming
     
    pd.melt(withmooc, id_vars=["subject","workshop","gender"], value_vars=['q1', 'q2','q3','q4']).head(10)

     

    Results
    	subject	workshop	gender	variable	value
    0	0	1		f	q1		1.0
    1	1	2		f	q1		2.0
    2	2	1		f	q1		2.0
    3	3	2		f	q1		3.0
    4	4	1		m	q1		4.0
    5	5	2		m	q1		5.0
    6	6	1		m	q1		5.0
    7	7	2		m	q1		4.0
    8	0	1		f	q2		1.0
    9	1	2		f	q2		1.0

     

     

    Python Programming
     
    pd.concat([pd.melt(withmooc, id_vars=["subject","workshop","gender"], value_vars=val, 
                                 var_name='subject', value_name='value') for val in ['q1','q2','q3','q4']]).reset_index().head(10)

     

    Results
    	index	subject	workshop	gender	subject	value
    0	0	q1	1		f	q1	1.0
    1	1	q1	2		f	q1	2.0
    2	2	q1	1		f	q1	2.0
    3	3	q1	2		f	q1	3.0
    4	4	q1	1		m	q1	4.0
    5	5	q1	2		m	q1	5.0
    6	6	q1	1		m	q1	5.0
    7	7	q1	2		m	q1	4.0
    8	0	q2	1		f	q2	1.0
    9	1	q2	2		f	q2	1.0

     


    * wide to long : stack

    • stack함수를 사용하여 가로 데이터를 세로 데이터로 전치.
    • Value 변수 값이 변수들에 대한 측정값이 된다.
    Python Programming
    withmooc= mydata.copy()
    withmooc
    
    withmooc['subject'] = range(0,8)
    
    withmooc = withmooc.iloc[:,1:].set_index(["subject","workshop","gender"])
    print(withmooc)
    
    stack_long = withmooc.stack(dropna=False).reset_index(name='value').rename(columns={'level_3':'variable'})
    
    stack_long.head(10)

     

    Results
                             q1  q2   q3  q4
    subject workshop gender                 
    0       1        f        1   1  5.0   1
    1       2        f        2   1  4.0   1
    2       1        f        2   2  4.0   3
    3       2        f        3   1  NaN   3
    4       1        m        4   5  2.0   4
    5       2        m        5   4  5.0   5
    6       1        m        5   3  4.0   4
    7       2        m        4   5  5.0   5

     

    Results
    	subject	workshop	gender	variable	value
    0	0	1		f	q1		1.0
    1	0	1		f	q2		1.0
    2	0	1		f	q3		5.0
    3	0	1		f	q4		1.0
    4	1	2		f	q1		2.0
    5	1	2		f	q2		1.0
    6	1	2		f	q3		4.0
    7	1	2		f	q4		1.0
    8	2	1		f	q1		2.0
    9	2	1		f	q2		2.0

     


    * wide to long : wide_to_long()

     

    Python Programming
    withmooc= mydata.copy()
    withmooc['subject'] = range(0,8)
    
    pd.wide_to_long(withmooc, ["q"], i=["subject","workshop","gender"], j="variable").drop(columns = "id").reset_index().rename(columns={'q': 'value'}).head(10)

     

    Results
    	subject	workshop	gender	variable	value
    0	0	1		f	1	1.0
    1	0	1		f	2	1.0
    2	0	1		f	3	5.0
    3	0	1		f	4	1.0
    4	1	2		f	1	2.0
    5	1	2		f	2	1.0
    6	1	2		f	3	4.0
    7	1	2		f	4	1.0
    8	2	1		f	1	2.0
    9	2	1		f	2	2.0

     


    * wide to long : pivot_table & unstack()

     

    Python Programming
    withmooc= mydata.copy()
    
    withmooc['subject'] = range(0,8)
    
    pivot_long = pd.pivot_table(withmooc, columns=["subject","workshop","gender"]).unstack().reset_index(name='value').rename(columns={'level_3': 'variable'})
    
    pivot_long.head(10)

     

    Results
    	subject	workshop	gender	variable	value
    0	0	1		f	q1	1.0
    1	0	1		f	q2	1.0
    2	0	1		f	q3	5.0
    3	0	1		f	q4	1.0
    4	1	2		f	q1	2.0
    5	1	2		f	q2	1.0
    6	1	2		f	q3	4.0
    7	1	2		f	q4	1.0
    8	2	1		f	q1	2.0
    9	2	1		f	q2	2.0

     


    * long to wide : pivot_table

    • Cast함수를 사용하여 세로 데이터를 가로 데이터로 전치.
    • Value 변수 값이 변수들에 대한 측정값이 된다.
    Python Programming
    pivot_long.pivot_table(index=["subject","workshop","gender"], 
                           columns='variable', 
                           values='value').reset_index()

     

    Results
    variable	subject	workshop	gender	q1	q2	q3	q4
    0		0	1		f	1.0	1.0	5.0	1.0
    1		1	2		f	2.0	1.0	4.0	1.0
    2		2	1		f	2.0	2.0	4.0	3.0
    3		3	2		f	3.0	1.0	NaN	3.0
    4		4	1		m	4.0	5.0	2.0	4.0
    5		5	2		m	5.0	4.0	5.0	5.0
    6		6	1		m	5.0	3.0	4.0	4.0
    7		7	2		m	4.0	5.0	5.0	5.0

     


    * long to wide : pivot

     

    Python Programming
    pivot_long.pivot(index=["subject","workshop","gender"], 
                           columns='variable', 
                           values='value').reset_index()

     

    Results
    variable	subject	workshop	gender	q1	q2	q3	q4
    0		0	1		f	1.0	1.0	5.0	1.0
    1		1	2		f	2.0	1.0	4.0	1.0
    2		2	1		f	2.0	2.0	4.0	3.0
    3		3	2		f	3.0	1.0	NaN	3.0
    4		4	1		m	4.0	5.0	2.0	4.0
    5		5	2		m	5.0	4.0	5.0	5.0
    6		6	1		m	5.0	3.0	4.0	4.0
    7		7	2		m	4.0	5.0	5.0	5.0

     


    * long to wide : unstack

    • NaN 값 유지를 위하여 agg 형태 사용
    Python Programming
    stack_long.groupby(["subject","workshop","gender","variable"]).agg({'value': lambda x: x.mean(skipna=False)}).unstack().reset_index()

     

    Results
    		subject	workshop	gender	value
    variable				q1	q2	q3	q4
    0		0	1		f	1.0	1.0	5.0	1.0
    1		1	2		f	2.0	1.0	4.0	1.0
    2		2	1		f	2.0	2.0	4.0	3.0
    3		3	2		f	3.0	1.0	NaN	3.0
    4		4	1		m	4.0	5.0	2.0	4.0
    5		5	2		m	5.0	4.0	5.0	5.0
    6		6	1		m	5.0	3.0	4.0	4.0
    7		7	2		m	4.0	5.0	5.0	5.0

     

     

    Python Programming
     
    unstack_wide = stack_long.set_index(["subject","workshop","gender","variable"]).unstack().reset_index()
    unstack_wide

     

    Results
    		subject	workshop	gender	value
    variable					q1	q2	q3	q4
    0		0	1		f	1.0	1.0	5.0	1.0
    1		1	2		f	2.0	1.0	4.0	1.0
    2		2	1		f	2.0	2.0	4.0	3.0
    3		3	2		f	3.0	1.0	NaN	3.0
    4		4	1		m	4.0	5.0	2.0	4.0
    5		5	2		m	5.0	4.0	5.0	5.0
    6		6	1		m	5.0	3.0	4.0	4.0
    7		7	2		m	4.0	5.0	5.0	5.0

     

     

    Python Programming
    unstack_wide = stack_long.set_index(["subject","workshop","gender","variable"]).unstack().reset_index()
    
    unstack_wide.columns = unstack_wide.columns.map('{0[0]}{0[1]}'.format)
    
    # unstack_wide.columns = unstack_wide.columns.droplevel()
    
    # unstack_wide.droplevel(0, axis=1)
    
    unstack_wide.columns = unstack_wide.columns.str.replace('value','')
    unstack_wide

     

    Results
    	subject	workshop	gender	q1	q2	q3	q4
    0	0	1		f	1.0	1.0	5.0	1.0
    1	1	2		f	2.0	1.0	4.0	1.0
    2	2	1		f	2.0	2.0	4.0	3.0
    3	3	2		f	3.0	1.0	NaN	3.0
    4	4	1		m	4.0	5.0	2.0	4.0
    5	5	2		m	5.0	4.0	5.0	5.0
    6	6	1		m	5.0	3.0	4.0	4.0
    7	7	2		m	4.0	5.0	5.0	5.0
     

     


    7. Python - dfply

     

    Python Programming
    import pandas as pd
    from dfply import *
    
    mydata   = pd.read_csv("c:/work/data/mydata.csv",sep=",",
                           dtype={'id':object,'workshop':object,
                                  'q1':int, 'q2':int, 'q3':float, 'q4':int},
                           na_values=['NaN'],skipinitialspace =True)
    
    withmooc= mydata.copy()
    
    # 모든 변수 선택하기.
    withmooc

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1	1	5.0	1
    1	2	2		f	2	1	4.0	1
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	1	NaN	3
    4	5	1		m	4	5	2.0	4
    5	6	2		m	5	4	5.0	5
    6	7	1		m	5	3	4.0	4
    7	8	2		m	4	5	5.0	5

     


    1) wide to long : gather 함수

     

    Python Programming
    pivot_long = withmooc >> gather('variable', 'value', ['q1', 'q2','q3','q4'])
    pivot_long.head(10)

     

    Results
    	id	workshop	gender	variable	value
    0	1	1		f	q1		1.0
    1	2	2		f	q1		2.0
    2	3	1		f	q1		2.0
    3	4	2		f	q1		3.0
    4	5	1		m	q1		4.0
    5	6	2		m	q1		5.0
    6	7	1		m	q1		5.0
    7	8	2		m	q1		4.0
    8	1	1		f	q2		1.0
    9	2	2		f	q2		1.0

     


    2) long to wide : spread 함수

     

    Python Programming
    pivot_long >> spread(X.variable, X.value)

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1.0	1.0	5.0	1.0
    1	2	2		f	2.0	1.0	4.0	1.0
    2	3	1		f	2.0	2.0	4.0	3.0
    3	4	2		f	3.0	1.0	NaN	3.0
    4	5	1		m	4.0	5.0	2.0	4.0
    5	6	2		m	5.0	4.0	5.0	5.0
    6	7	1		m	5.0	3.0	4.0	4.0
    7	8	2		m	4.0	5.0	5.0	5.0
     

     


     

    통계프로그램 비교 목록(Proc sql, SAS, SPSS, R 프로그래밍, R Tidyverse, Python Pandas, Python Dfply)
    [Oracle, Pandas, R Prog, Dplyr, Sqldf, Pandasql, Data.Table] 오라클 함수와 R & Python 비교 사전 목록 링크
    [SQL, Pandas, R Prog, Dplyr, SQLDF, PANDASQL, DATA.TABLE]
    SQL EMP 예제로 만나는 테이블 데이터 처리 방법 리스트 링크
    반응형

    댓글