본문 바로가기
통계프로그램 비교 시리즈/데이터 전처리 비교

[데이터 관리] 6. 관측값 포맷팅 & 관측값 일괄 변경하기

by 기서무나구물 2022. 1. 10.

포스팅 목차

    [데이터 관리] 6. 관측값 포맷팅 & 관측값 일괄 변경하기

     


    1. Proc SQL

     

    CASE함수를 이용하여 RECODING;

     

    SAS Programming
    options linesize=120;
    
    proc sql;
      create table withmooc as
        select a.*,
    
               case q1 when 1 then 2
                       when 5 then 4
               else q1 end as qr1,
    
               case q2 when 1 then 2
                       when 5 then 4
               else q2 end as qr2,
    
               case q3 when 1 then 2
                       when 5 then 4
               else q3 end as qr3,
    
               case q4 when 1 then 2
                       when 5 then 4
               else q4 end as qr4                 
    
        from   BACK.mydata a;
    
      select * from withmooc;
    quit;

     

    Results
    id  workshop  gender          q1        q2        q3        q4       qr1       qr2       qr3       qr4
     1         1  f                1         1         5         1         2         2         4         2
     2         2  f                2         1         4         1         2         2         4         2
     3         1  f                2         2         4         3         2         2         4         3
     4         2  f                3         1         .         3         3         2         .         3
     5         1  m                4         5         2         4         4         4         2         4
     6         2  m                5         4         5         5         4         4         4         4
     7         1  m                5         3         4         4         4         3         4         4
     8         2  m                4         5         5         5         4         4         4         4

     

     

    SAS Programming
    proc sql;
      select min(q1)  as q1_min,
             mean(q1) as q1_mean,
             max(q1)  as q1_max,
             sum(q1)  as q1_sum,
             min(q2)  as q2_min,
             mean(q2) as q2_mean,
             max(q2)  as q2_max,
             sum(q2)  as q2_sum
      from withmooc;
    quit;

     

    Results
    q1_min   q1_mean    q1_max    q1_sum    q2_min   q2_mean    q2_max    q2_sum
    ----------------------------------------------------------------------------
         1      3.25         5        26         1      2.75         5        22

     


    2. SAS Programming

     

    SAS Programming
    DATA withmooc;
         INFILE 'c:\data\mydata.csv' delimiter = ','
                MISSOVER DSD LRECL=32767 firstobs=2 ;
         INPUT id workshop gender $ q1 q2 q3 q4;
    run;
    
    PROC PRINT; RUN;
    
    PROC FORMAT;
         VALUE Agreement 1="Disagree"
                         2="Disagree"
                         3="Neutral"
                         4="Agree"   
                         5="Agree";
    run;
    
    
    
    # 5점 척도를 3점 척도로 변환. 1과 2를 하나의 척도로 4와 5를 또하나의 척도로 변환.;
    
    DATA withmooc1;
     SET withmooc;
         ARRAY q q1-q4;
         ARRAY qr qr1-qr4; *r for recoded;
    
         DO i=1 to 4;
            qr{i}=q{i};
            if      q{i}=1 then qr{i}=2;
            else if q{i}=5 then qr{i}=4;
         END;
    
         FORMAT q1-q4 qr1-qr4 Agreement.;
    RUN;
    
    proc print;run;

     

    Results
    id workshop gender q1       q2          q3    q4       qr1      qr2        qr3    qr4      i
     1     1      f    Disagree Disagree Agree    Disagree Disagree Disagree Agree    Disagree 5
     2     2      f    Disagree Disagree Agree    Disagree Disagree Disagree Agree    Disagree 5
     3     1      f    Disagree Disagree Agree    Neutral  Disagree Disagree Agree    Neutral  5
     4     2      f    Neutral  Disagree        . Neutral  Neutral  Disagree        . Neutral  5
     5     1      m    Agree    Agree    Disagree Agree    Agree    Agree    Disagree Agree    5
     6     2      m    Agree    Agree    Agree    Agree    Agree    Agree    Agree    Agree    5
     7     1      m    Agree    Neutral  Agree    Agree    Agree    Neutral  Agree    Agree    5
     8     2      m    Agree    Agree    Agree    Agree    Agree    Agree    Agree    Agree    5

     

     

    SAS Programming
    proc print data=withmooc1;
         format q1-q4 qr1-qr4;
    run;

     

    Results
    OBS    id    workshop    gender    q1    q2    q3    q4    qr1    qr2    qr3    qr4    i
     1      1        1         f        1     1     5     1     2      2      4      2     5
     2      2        2         f        2     1     4     1     2      2      4      2     5
     3      3        1         f        2     2     4     3     2      2      4      3     5
     4      4        2         f        3     1     .     3     3      2      .      3     5
     5      5        1         m        4     5     2     4     4      4      2      4     5
     6      6        2         m        5     4     5     5     4      4      4      4     5
     7      7        1         m        5     3     4     4     4      3      4      4     5
     8      8        2         m        4     5     5     5     4      4      4      4     5

     


    • 자동적으로 포맷 이용.
    SAS Programming
    PROC FREQ data=withmooc1;
         TABLES q1-q4;
    RUN;

     

    Results
                                       누적       누적
          q1      빈도      백분율      빈도      백분율
    ----------------------------------------------------
    Disagree           3     37.50           3     37.50
    Neutral            1     12.50           4     50.00
    Agree              4     50.00           8    100.00 
    
                                       누적       누적
          q2      빈도      백분율      빈도      백분율
    ----------------------------------------------------
    Disagree           4     50.00           4     50.00
    Neutral            1     12.50           5     62.50
    Agree              3     37.50           8    100.00 
    
                                       누적       누적
          q3      빈도      백분율      빈도      백분율
    ----------------------------------------------------
    Disagree           1     14.29           1     14.29
    Agree              6     85.71           7    100.00
    
                     결측값의 개수 = 1 
    
                                       누적       누적
          q4      빈도      백분율      빈도      백분율
    ----------------------------------------------------
    Disagree           2     25.00           2     25.00
    Neutral            2     25.00           4     50.00
    Agree              4     50.00           8    100.00

     


    • q변수들은 포맷을 적용 안 함.;
    • high/low 값은 1/5로 된다.;
    SAS Programming
    PROC UNIVARIATE data=withmooc1;
         VAR q1-q4;
    RUN;

     

    Results
    분위수(정의 5)
    
    분위수         추정값
    100% 최대값       5.0
     99%              5.0
     95%              5.0
     90%              5.0
     75% Q3           4.5
     50% 중위수       3.5
     25% Q1           2.0
     10%              1.0
      5%              1.0
      1%              1.0
      0% 최소값       1.0

     


    • qr변수들은 3점 척도 적용한다.;
    • high/low 값은 1/5로 된다.
    SAS Programming
    PROC UNIVARIATE data=withmooc1;
         VAR qr1-qr4;
    RUN;

     

    Results
      분위수         추정값
     100% 최대값       4.0
      99%              4.0
      95%              4.0
      90%              4.0
      75% Q3           4.0
      50% 중위수       3.5
      25% Q1           2.0
      10%              2.0
       5%              2.0
       1%              2.0
       0% 최소값       2.0

     


    3. SPSS

    • SPSS Program for Recoding Variables.
    SPSS Programming
    GET FILE='C:\mydata.sav'.
    RECODE q1 to q4 (1=2) (5=4).
    SAVE OUTFILE='C:\myleft.sav'.
    EXECUTE .

     


    4. R Programming (R-PROJECT)

     

    • Recoding 변수를 위한 R-Project 프로그램.
    R Programming
    from rpy2.robjects import r
    %load_ext rpy2.ipython

     

    Results
    The rpy2.ipython extension is already loaded. To reload it, use:
      %reload_ext rpy2.ipython

     

     

    R Programming
    %%R
    
    load(file="C:/work/data/mydata.Rdata")
    
    withmooc = mydata
    attach(withmooc) # mydata를 기본 데이터 세트로 지정.
    
    withmooc

     

    Results
      workshop gender q1 q2 q3 q4
    1        1      f  1  1  5  1
    2        2      f  2  1  4  1
    3        1      f  2  2  4  3
    4        2      f  3  1 NA  3
    5        1      m  4  5  2  4
    6        2      m  5  4  5  5
    7        1      m  5  3  4  4
    8        2      m  4  5  5  5

     

     

    • recode함수를 이용하여 관측값 변경;
    R Programming
    %%R
    
    # install.packages("car")
    library(car)

     

    Results
    From cffi callback <function _consolewrite_ex at 0x0000023AC5B63820>:
    Traceback (most recent call last):
      File "C:\Users\BACK\anaconda3\lib\site-packages\rpy2\rinterface_lib\callbacks.py", line 131, in _consolewrite_ex
        s = conversion._cchar_to_str_with_maxlen(buf, maxlen=n)
      File "C:\Users\BACK\anaconda3\lib\site-packages\rpy2\rinterface_lib\conversion.py", line 130, in _cchar_to_str_with_maxlen
        s = ffi.string(c, maxlen).decode('utf-8')
    UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc7 in position 0: invalid continuation byte

     

     

    R Programming
    %%R
    
    withmooc$q1<-car::recode(withmooc$q1,"1=2;5=4")
    withmooc$q2<-car::recode(withmooc$q2,"1=2;5=4")
    withmooc$q3<-car::recode(withmooc$q3,"1=2;5=4")
    withmooc$q4<-car::recode(withmooc$q4,"1=2;5=4")
    
    withmooc

     

    Results
      workshop gender q1 q2 q3 q4
    1        1      f  2  2  4  2
    2        2      f  2  2  4  2
    3        1      f  2  2  4  3
    4        2      f  3  2 NA  3
    5        1      m  4  4  2  4
    6        2      m  4  4  4  4
    7        1      m  4  3  4  4
    8        2      m  4  4  4  4

     

     

    • 변수명의 두 개의 세트 생성.
    R Programming
    %%R
    
    myQnames  <- base::paste( "q",  1:4, sep="")
    print(myQnames) # 원 변수명.
    
    myQRnames <- base::paste( "qr", 1:4, sep="")
    myQRnames       # Recoded될 변수의 이름.

     

    Results
    [1] "q1" "q2" "q3" "q4"
    [1] "qr1" "qr2" "qr3" "qr4"

     

     

    • 데이터 프레임에서 q변수를 추출.
    R Programming
    %%R
    
    myQRvars <- withmooc[ ,myQnames]
    
    myQRvars

     

    Results
      q1 q2 q3 q4
    1  2  2  4  2
    2  2  2  4  2
    3  2  2  4  3
    4  3  2 NA  3
    5  4  4  2  4
    6  4  4  4  4
    7  4  3  4  4
    8  4  4  4  4

     

     

    • 변수명을 변경.
    R Programming
    %%R
    
    names(myQRvars) <- myQRnames
    myQRvars

     

    Results
      qr1 qr2 qr3 qr4
    1   2   2   4   2
    2   2   2   4   2
    3   2   2   4   3
    4   3   2  NA   3
    5   4   4   2   4
    6   4   4   4   4
    7   4   3   4   4
    8   4   4   4   4

     


    • 많은 변수에 라벨을 적용하기 위해 함수 생성.
    R Programming
    %%R
    
    myRecoder <- function(x) { recode(x,"1=2;5=4") }

     


    • 하나의 변수에 함수 적용.
    R Programming
    %%R
    
    myRecoder(myQRvars$qr1)

     

    Results
    [1] 2 2 2 3 4 4 4 4

     


    • lapply함수를 사용하여 모든 변수에 적용하여 관측값 변경하기.
    R Programming
    %%R
    
    myQRvars[ ,myQRnames] <- lapply( myQRvars[ ,myQRnames ], myRecoder )
    
    myQRvars

     

    Results
      qr1 qr2 qr3 qr4
    1   2   2   4   2
    2   2   2   4   2
    3   2   2   4   3
    4   3   2  NA   3
    5   4   4   2   4
    6   4   4   4   4
    7   4   3   4   4
    8   4   4   4   4

     


    • 원 변수와 새 변수 합치기.
    R Programming
    %%R
    
    withmooc <- cbind(withmooc,myQRvars)
    
    withmooc

     

    Results
      workshop gender q1 q2 q3 q4 qr1 qr2 qr3 qr4
    1        1      f  2  2  4  2   2   2   4   2
    2        2      f  2  2  4  2   2   2   4   2
    3        1      f  2  2  4  3   2   2   4   3
    4        2      f  3  2 NA  3   3   2  NA   3
    5        1      m  4  4  2  4   4   4   2   4
    6        2      m  4  4  4  4   4   4   4   4
    7        1      m  4  3  4  4   4   3   4   4
    8        2      m  4  4  4  4   4   4   4   4

     

     

    R Programming
    %%R
    options(width = 150)
    
    summary(withmooc)

     

    Results
        workshop      gender                q1              q2              q3              q4            qr1             qr2             qr3             qr4  
     Min.   :1.0   Length:8           Min.   :2.000   Min.   :2.000   Min.   :2.000   Min.   :2.00   Min.   :2.000   Min.   :2.000   Min.   :2.000   Min.   :2.00  
     1st Qu.:1.0   Class :character   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:4.000   1st Qu.:2.75   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:4.000   1st Qu.:2.75  
     Median :1.5   Mode  :character   Median :3.500   Median :2.500   Median :4.000   Median :3.50   Median :3.500   Median :2.500   Median :4.000   Median :3.50 
     Mean   :1.5                      Mean   :3.125   Mean   :2.875   Mean   :3.714   Mean   :3.25   Mean   :3.125   Mean   :2.875   Mean   :3.714   Mean   :3.25
     3rd Qu.:2.0                      3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.00   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.00  
     Max.   :2.0                      Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.00   Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.00 
                                                                      NA's   :1                                                      NA's   :1      

     


    5. R - Tidyverse

    • Tip : Jupyter Notebook 상에서 R에서 생성된 데이터를 파이썬(ipython)으로 전달하기
    • %%R -o 파이썬으로 전 송하기 위한 데이터명
    R Programming
    %%R -o back
    
    back = mydata

     

    R Programming
    back

     

    Results
    	workshop	gender	q1	q2	q3		q4
    1	1		f	1	1	5		1
    2	2		f	2	1	4		1
    3	1		f	2	2	4		3
    4	2		f	3	1	-2147483648	3
    5	1		m	4	5	2		4
    6	2		m	5	4	5		5
    7	1		m	5	3	4		4
    8	2		m	4	5	5		5
     

     

     

    R Programming
    from rpy2.robjects import r
    %load_ext rpy2.ipython

     

    Results
    The rpy2.ipython extension is already loaded. To reload it, use:
      %reload_ext rpy2.ipython

     

     

    R Programming
    %%R
    
    library(tidyverse)
    load(file="C:/work/data/mydata.Rdata")
    
    withmooc = mydata
    
    attach(withmooc) # mydata를 기본 데이터 세트로 지정.
    
    withmooc
    R[write to console]: The following objects are masked from withmooc (pos = 8):
    
        gender, q1, q2, q3, q4, workshop
    
    
    
    
      workshop gender q1 q2 q3 q4
    1        1      f  1  1  5  1
    2        2      f  2  1  4  1
    3        1      f  2  2  4  3
    4        2      f  3  1 NA  3
    5        1      m  4  5  2  4
    6        2      m  5  4  5  5
    7        1      m  5  3  4  4
    8        2      m  4  5  5  5

     

     

    • recode함수를 이용하여 관측값 변경;
    R Programming
    %%R
    
    # install.packages("car")
    library(tidyverse)
    library(car)

     

     

    • dplyr의 Record 함수와 구분하기 위하여 car 패키지 지정
    R Programming
    %%R
    
    withmooc %>% mutate(q1=car::recode(q1,"1=2;5=4"),
                        q2=car::recode(q2,"1=2;5=4"),
                        q3=car::recode(q3,"1=2;5=4"),
                        q4=car::recode(q4,"1=2;5=4")
                        )

     

    Results
      workshop gender q1 q2 q3 q4
    1        1      f  2  2  4  2
    2        2      f  2  2  4  2
    3        1      f  2  2  4  3
    4        2      f  3  2 NA  3
    5        1      m  4  4  2  4
    6        2      m  4  4  4  4
    7        1      m  4  3  4  4
    8        2      m  4  4  4  4

     

     

    • 많은 변수에 라벨을 적용하기 위해 함수 생성.
    R Programming
    %%R
    
    myQnames  <- base::paste( "q",  1:4, sep="")
    print(myQnames) # 원 변수명.
    
    myQRvars <- withmooc[ ,myQnames]
    
    myQRvars

     

    Results
    [1] "q1" "q2" "q3" "q4"
      q1 q2 q3 q4
    1  1  1  5  1
    2  2  1  4  1
    3  2  2  4  3
    4  3  1 NA  3
    5  4  5  2  4
    6  5  4  5  5
    7  5  3  4  4
    8  4  5  5  5

     

     

    R Programming
    %%R
    
    withmooc = mydata
    
    myRecoder <- function(x) { car::recode(x,"1=2;5=4") }
    
    print( myRecoder(withmooc$q1) )

     

    Results
    [1] 2 2 2 3 4 4 4 4

     

     

    • map : 각 변수 별로 함수 적용 후 하나의 테이블로 재구성됨.
    R Programming
    %%R
    
    withmooc %>%
      select(q1,q2,q3,q4) %>%
      map(~ transmute(withmooc, {{.}} := myRecoder(.))) %>%
      as_tibble()

     

    Results
    # A tibble: 8 x 4
      q1$..1 q2$..1 q3$..1 q4$..1
       <dbl>  <dbl>  <dbl>  <dbl>
    1      2      2      4      2
    2      2      2      4      2
    3      2      2      4      3
    4      3      2     NA      3
    5      4      4      2      4
    6      4      4      4      4
    7      4      3      4      4
    8      4      4      4      4

     

     

     

    R Programming
    %%R
    
    withmooc %>%
      select(q1,q2,q3,q4) %>%
      purrr::map_df(myRecoder)

     

    Results
    # A tibble: 8 x 4
         q1    q2    q3    q4
      <dbl> <dbl> <dbl> <dbl>
    1     2     2     4     2
    2     2     2     4     2
    3     2     2     4     3
    4     3     2    NA     3
    5     4     4     2     4
    6     4     4     4     4
    7     4     3     4     4
    8     4     4     4     4

     

     

    R Programming
    %%R
    
    myQRvars <- withmooc %>% 
      purrr::keep(.p = is.numeric) %>% # 숫자형 변수 선택
      purrr::map_df(.x = .,
                    .f = myRecoder)
    
    myQRvars

     

    Results
    # A tibble: 8 x 5
      workshop    q1    q2    q3    q4
         <dbl> <dbl> <dbl> <dbl> <dbl>
    1        2     2     2     4     2
    2        2     2     2     4     2
    3        2     2     2     4     3
    4        2     3     2    NA     3
    5        2     4     4     2     4
    6        2     4     4     4     4
    7        2     4     3     4     4
    8        2     4     4     4     4

     

     

    • 원 변수와 새 변수 합치기.
    R Programming
    %%R
    
    withmooc <- cbind(withmooc,myQRvars)
    
    withmooc

     

    Results
      workshop gender q1 q2 q3 q4 workshop q1 q2 q3 q4
    1        1      f  1  1  5  1        2  2  2  4  2
    2        2      f  2  1  4  1        2  2  2  4  2
    3        1      f  2  2  4  3        2  2  2  4  3
    4        2      f  3  1 NA  3        2  3  2 NA  3
    5        1      m  4  5  2  4        2  4  4  2  4
    6        2      m  5  4  5  5        2  4  4  4  4
    7        1      m  5  3  4  4        2  4  3  4  4
    8        2      m  4  5  5  5        2  4  4  4  4

     

     

    • 통계량 일괄 산출
    R Programming
    %%R
    
    library(psych)
    library(mnormt)
    library(skimr)

     

     

    R Programming
    %%R
    
    withmooc %>% 
      purrr::keep(.p = is.numeric) %>% # 숫자형 데이터만 남기기
      purrr::map_df(.x = .,
                    .f = psych::describe)

     

    Results
            vars n mean   sd median trimmed  mad min max range  skew kurtosis   se
    X1...1     1 8 1.50 0.53    1.5    1.50 0.74   1   2     1  0.00    -2.23 0.19
    X1...2     1 8 3.25 1.49    3.5    3.25 2.22   1   5     4 -0.14    -1.73 0.53
    X1...3     1 8 2.75 1.75    2.5    2.75 2.22   1   5     4  0.19    -1.91 0.62
    X1...4     1 7 4.14 1.07    4.0    4.14 1.48   2   5     3 -0.93    -0.52 0.40
    X1...5     1 8 3.25 1.58    3.5    3.25 1.48   1   5     4 -0.36    -1.59 0.56
    X1...6     1 8 2.00 0.00    2.0    2.00 0.00   2   2     0   NaN      NaN 0.00
    X1...7     1 8 3.12 0.99    3.5    3.12 0.74   2   4     2 -0.20    -2.07 0.35
    X1...8     1 8 2.88 0.99    2.5    2.88 0.74   2   4     2  0.20    -2.07 0.35
    X1...9     1 7 3.71 0.76    4.0    3.71 0.00   2   4     2 -1.62     0.80 0.29
    X1...10    1 8 3.25 0.89    3.5    3.25 0.74   2   4     2 -0.40    -1.75 0.31

     

     

    R Programming
    %%R
    
    str(withmooc)

     

    Results
    'data.frame':    8 obs. of  11 variables:
     $ workshop: int  1 2 1 2 1 2 1 2
     $ gender  : chr  "f" "f" "f" "f" ...
     $ q1      : int  1 2 2 3 4 5 5 4
     $ q2      : int  1 1 2 1 5 4 3 5
     $ q3      : int  5 4 4 NA 2 5 4 5
     $ q4      : int  1 1 3 3 4 5 4 5
     $ workshop: num  2 2 2 2 2 2 2 2
     $ q1      : num  2 2 2 3 4 4 4 4
     $ q2      : num  2 2 2 2 4 4 3 4
     $ q3      : num  4 4 4 NA 2 4 4 4
     $ q4      : num  2 2 3 3 4 4 4 4

     

     

    R Programming
    %%R
    
    withmooc = mydata
    skimr::skim(withmooc)

     

    Results
    -- Data Summary ------------------------
                               Values  
    Name                       withmooc
    Number of rows             8       
    Number of columns          6       
    _______________________            
    Column type frequency:             
      character                1       
      numeric                  5       
    ________________________           
    Group variables            None    
    
    -- Variable type: character ----------------------------------------------------
    # A tibble: 1 x 8
      skim_variable n_missing complete_rate   min   max empty n_unique whitespace
    * <chr>             <int>         <dbl> <int> <int> <int>    <int>      <int>
    1 gender                0             1     1     1     0        2          0
    
    -- Variable type: numeric ------------------------------------------------------
    # A tibble: 5 x 11
      skim_variable n_missing complete_rate  mean    sd    p0   p25   p50   p75
    * <chr>             <int>         <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    1 workshop              0         1      1.5  0.535     1   1     1.5  2   
    2 q1                    0         1      3.25 1.49      1   2     3.5  4.25
    3 q2                    0         1      2.75 1.75      1   1     2.5  4.25
    4 q3                    1         0.875  4.14 1.07      2   4     4    5   
    5 q4                    0         1      3.25 1.58      1   2.5   3.5  4.25
       p100 hist 
    * <dbl> <chr>
    1     2 <U+2587><U+2581><U+2581><U+2581><U+2587>
    2     5 <U+2583><U+2587><U+2583><U+2587><U+2587>
    3     5 <U+2587><U+2582><U+2582><U+2582><U+2585>
    4     5 <U+2582><U+2581><U+2581><U+2587><U+2587>
    5     5 <U+2587><U+2581><U+2587><U+2587><U+2587>

     


    6. Python - Pandas

     

    Python Programming
    import pandas as pd
    import numpy as np
    import sweetviz as sv
    from IPython.display import display  # DataFrame()을 HTML로 출력
    
    mydata   = pd.read_csv("C:/work/data/mydata.csv",sep=",",
                           dtype={'id':object,'workshop':object,
                                  'q1':int, 'q2':int, 'q3':float, 'q4':int},
                           na_values=['NaN'],skipinitialspace =True)
    
    withmooc= mydata.copy()
    
    withmooc

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1	1	5.0	1
    1	2	2		f	2	1	4.0	1
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	1	NaN	3
    4	5	1		m	4	5	2.0	4
    5	6	2		m	5	4	5.0	5
    6	7	1		m	5	3	4.0	4
    7	8	2		m	4	5	5.0	5
     
     
    • Recoding 변수를 위한 R-Project 프로그램.
    Python Programming
    withmooc= mydata.copy()
    withmooc
    
    mapper = {1:2, 2:2, 3:3,4:4,5:4}
    
    withmooc['q1'] = withmooc['q1'].map(mapper)
    withmooc['q2'] = withmooc['q2'].map(mapper)
    withmooc['q3'] = withmooc['q3'].map(mapper)
    withmooc['q4'] = withmooc['q4'].map(mapper)
    
    withmooc

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	2	2	4.0	2
    1	2	2		f	2	2	4.0	2
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	2	NaN	3
    4	5	1		m	4	4	2.0	4
    5	6	2		m	4	4	4.0	4
    6	7	1		m	4	3	4.0	4
    7	8	2		m	4	4	4.0	4
     

     

     

     

    Python Programming
    withmooc= mydata.copy()
    display(withmooc)
    
    withmooc[['q1','q2','q3','q4']]= withmooc[['q1','q2','q3','q4']].applymap(lambda x: 2 if x == 1 else 4 if x==5 else x)
    withmooc

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1	1	5.0	1
    1	2	2		f	2	1	4.0	1
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	1	NaN	3
    4	5	1		m	4	5	2.0	4
    5	6	2		m	5	4	5.0	5
    6	7	1		m	5	3	4.0	4
    7	8	2		m	4	5	5.0	5
     
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	2	2	4.0	2
    1	2	2		f	2	2	4.0	2
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	2	NaN	3
    4	5	1		m	4	4	2.0	4
    5	6	2		m	4	4	4.0	4
    6	7	1		m	4	3	4.0	4
    7	8	2		m	4	4	4.0	4
     
     
    • 변수명의 두 개의 세트 생성.
    • R 패키지의 paste 함수
    Python Programming
    myQnames = ["q" + str(x + 1) for x in range(4)]
    display(myQnames)
    
    # Recoded될 변수의 이름.
    myQRnames = ["qr" + str(x + 1) for x in range(4)]
    display(myQRnames)
    
    # 데이터 프레임에서 q변수를 추출.
    myQRvars = withmooc.loc[: ,myQnames]
    display(myQRvars)
    
    
    # 변수명을 변경.
    myQRvars.columns = myQRnames
    myQRvars

     

    Results
    ['q1', 'q2', 'q3', 'q4']
    
    
    
    ['qr1', 'qr2', 'qr3', 'qr4']

     

    Results
    	q1	q2	q3	q4
    0	2	2	4.0	2
    1	2	2	4.0	2
    2	2	2	4.0	3
    3	3	2	NaN	3
    4	4	4	2.0	4
    5	4	4	4.0	4
    6	4	3	4.0	4
    7	4	4	4.0	4

     

    Results
    	qr1	qr2	qr3	qr4
    0	2	2	4.0	2
    1	2	2	4.0	2
    2	2	2	4.0	3
    3	3	2	NaN	3
    4	4	4	2.0	4
    5	4	4	4.0	4
    6	4	3	4.0	4
    7	4	4	4.0	4
     
     
    • 많은 변수에 라벨을 적용하기 위해 함수 생성.
    Python Programming
    def myRecoder(x):
        if x == 1:
            return 2
        elif x == 5:
            return 4
        else:
            return x
    
    # 하나의 변수에 함수 적용.
    myQRvars['qr1'].apply(lambda x:myRecoder(x))

     

    Results
    0    2
    1    2
    2    2
    3    3
    4    4
    5    4
    6    4
    7    4
    Name: qr1, dtype: int64

     

     

    • lapply함수를 사용하여 모든 변수에 적용하기.
    Python Programming
    withmooc= mydata.copy()
    display(withmooc)
    
    withmooc[['q1','q2','q3','q4']]= withmooc[['q1','q2','q3','q4']].applymap(myRecoder)
    withmooc

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1	1	5.0	1
    1	2	2		f	2	1	4.0	1
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	1	NaN	3
    4	5	1		m	4	5	2.0	4
    5	6	2		m	5	4	5.0	5
    6	7	1		m	5	3	4.0	4
    7	8	2		m	4	5	5.0	5

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	2	2	4.0	2
    1	2	2		f	2	2	4.0	2
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	2	NaN	3
    4	5	1		m	4	4	2.0	4
    5	6	2		m	4	4	4.0	4
    6	7	1		m	4	3	4.0	4
    7	8	2		m	4	4	4.0	4
     
     
    • 원 변수와 새 변수 합치기.
    Python Programming
    withmooc = pd.concat([withmooc,myQRvars],axis = 1)
    withmooc

     

    Results
    	id	workshop	gender	q1	q2	q3	q4	qr1	qr2	qr3	qr4
    0	1	1		f	2	2	4.0	2	2	2	4.0	2
    1	2	2		f	2	2	4.0	2	2	2	4.0	2
    2	3	1		f	2	2	4.0	3	2	2	4.0	3
    3	4	2		f	3	2	NaN	3	3	2	NaN	3
    4	5	1		m	4	4	2.0	4	4	4	2.0	4
    5	6	2		m	4	4	4.0	4	4	4	4.0	4
    6	7	1		m	4	3	4.0	4	4	3	4.0	4
    7	8	2		m	4	4	4.0	4	4	4	4.0	4
     
     
    • 통계량 산출
    Python Programming
    withmooc.describe()

     

    Results
    	q1		q2		q3		q4		qr1		qr2		qr3		qr4
    count	8.000000	8.000000	7.000000	8.000000	8.000000	8.000000	7.000000	8.000000
    mean	3.125000	2.875000	3.714286	3.250000	3.125000	2.875000	3.714286	3.250000
    std	0.991031	0.991031	0.755929	0.886405	0.991031	0.991031	0.755929	0.886405
    min	2.000000	2.000000	2.000000	2.000000	2.000000	2.000000	2.000000	2.000000
    25%	2.000000	2.000000	4.000000	2.750000	2.000000	2.000000	4.000000	2.750000
    50%	3.500000	2.500000	4.000000	3.500000	3.500000	2.500000	4.000000	3.500000
    75%	4.000000	4.000000	4.000000	4.000000	4.000000	4.000000	4.000000	4.000000
    max	4.000000	4.000000	4.000000	4.000000	4.000000	4.000000	4.000000	4.000000
     

    7. Python - dfply

     

    Python Programming
    import pandas as pd
    from dfply import *
    
    mydata   = pd.read_csv("c:/work/data/mydata.csv",sep=",",
                           dtype={'id':object,'workshop':object,
                                  'q1':int, 'q2':int, 'q3':float, 'q4':int},
                           na_values=['NaN'],skipinitialspace =True)
    
    
    withmooc= mydata.copy()
    
    # 모든 변수 선택하기.
    withmooc

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1	1	5.0	1
    1	2	2		f	2	1	4.0	1
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	1	NaN	3
    4	5	1		m	4	5	2.0	4
    5	6	2		m	5	4	5.0	5
    6	7	1		m	5	3	4.0	4
    7	8	2		m	4	5	5.0	5
     
     
    • replace 함수를 사용하여서 1은 2로 5는 4로 변경
    Python Programming
    withmooc >> mutate(q11 = X.q1.replace({1, 5},{2,4})) 

     

    Results
    	id	workshop	gender	q1	q2	q3	q4	q11
    0	1	1		f	1	1	5.0	1	2
    1	2	2		f	2	1	4.0	1	2
    2	3	1		f	2	2	4.0	3	2
    3	4	2		f	3	1	NaN	3	3
    4	5	1		m	4	5	2.0	4	4
    5	6	2		m	5	4	5.0	5	4
    6	7	1		m	5	3	4.0	4	4
    7	8	2		m	4	5	5.0	5	4
     

     

     

    Python Programming
    withmooc >> mutate(q1=make_symbolic(np.where)( (X.q1==1) ,2, make_symbolic(np.where)( (X.q1==5) ,4,X.q1 )),
                       q2=make_symbolic(np.where)( (X.q2==1) ,2, make_symbolic(np.where)( (X.q2==5) ,4,X.q2 )),
                       q3=make_symbolic(np.where)( (X.q3==1) ,2, make_symbolic(np.where)( (X.q3==5) ,4,X.q3 )),
                       q4=make_symbolic(np.where)( (X.q4==1) ,2, make_symbolic(np.where)( (X.q4==5) ,4,X.q4 ))
                                                  )

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	2	2	4.0	2
    1	2	2		f	2	2	4.0	2
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	2	NaN	3
    4	5	1		m	4	4	2.0	4
    5	6	2		m	4	4	4.0	4
    6	7	1		m	4	3	4.0	4
    7	8	2		m	4	4	4.0	4
     

     

     


    Python Programming
    # [참고]
    
    withmooc >> mutate(a06=if_else((X['q3'].apply(pd.isnull)),300,1))

     

    Results
    	id	workshop	gender	q1	q2	q3	q4	a06
    0	1	1		f	1	1	5.0	1	1
    1	2	2		f	2	1	4.0	1	1
    2	3	1		f	2	2	4.0	3	1
    3	4	2		f	3	1	NaN	3	300
    4	5	1		m	4	5	2.0	4	1
    5	6	2		m	5	4	5.0	5	1
    6	7	1		m	5	3	4.0	4	1
    7	8	2		m	4	5	5.0	5	1
     

     

     

    Python Programming
    @make_symbolic
    def My5isNA(x):
        return make_symbolic(np.where)( (x==1) ,2, make_symbolic(np.where)( (x==5) ,4,x ))
    
    withmooc >> mutate(a06=X['q2'].apply(My5isNA))

     

    Results
    	id	workshop	gender	q1	q2	q3	q4	a06
    0	1	1		f	1	1	5.0	1	2
    1	2	2		f	2	1	4.0	1	2
    2	3	1		f	2	2	4.0	3	2
    3	4	2		f	3	1	NaN	3	2
    4	5	1		m	4	5	2.0	4	4
    5	6	2		m	5	4	5.0	5	4
    6	7	1		m	5	3	4.0	4	3
    7	8	2		m	4	5	5.0	5	4
     

     

     

    Python Programming
    @make_symbolic
    def My5isNA(x):
        return make_symbolic(np.where)( (x==1) ,2, make_symbolic(np.where)( (x==5) ,4,x ))
    
    @pipe
    @symbolic_evaluation()
    def symbolic_double(df, serieses):
        for series in serieses:
            df[series.name] = My5isNA(series)
        return df
    
    withmooc >> symbolic_double([X.q1, X.q2, X.q3, X.q4])

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	2	2	4.0	2
    1	2	2		f	2	2	4.0	2
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	2	NaN	3
    4	5	1		m	4	4	2.0	4
    5	6	2		m	4	4	4.0	4
    6	7	1		m	4	3	4.0	4
    7	8	2		m	4	4	4.0	4

     

     
    • 수치형 변수에 대한 자동 포맷팅
    Python Programming
    @make_symbolic
    def My5isNA(x):
        return make_symbolic(np.where)( (x==1) ,2 , make_symbolic(np.where)( (x==5) ,4,x ))
    
    withmooc >> mutate(**{
      **{f"{x}": X[x].apply(My5isNA) for x in withmooc.select_dtypes([int,float]).columns}
      })

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	2	2	4.0	2
    1	2	2		f	2	2	4.0	2
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	2	NaN	3
    4	5	1		m	4	4	2.0	4
    5	6	2		m	4	4	4.0	4
    6	7	1		m	4	3	4.0	4
    7	8	2		m	4	4	4.0	4

     

    • 통계량 일괄 산출
    Python Programming
    withmooc >> summarize(**{
      **{f"{x}_mean": X[x].mean() for x in mydata.select_dtypes(int).columns},
      **{f"{x}_std" : X[x].std()  for x in mydata.select_dtypes(int).columns}
      })

     

    Results
    	q1_mean	q2_mean	q4_mean		q1_std		q2_std		q4_std
    0	3.25	2.75	3.25		1.488048	1.752549	1.581139


     


     

    통계프로그램 비교 목록(Proc sql, SAS, SPSS, R 프로그래밍, R Tidyverse, Python Pandas, Python Dfply)
    [Oracle, Pandas, R Prog, Dplyr, Sqldf, Pandasql, Data.Table] 오라클 함수와 R & Python 비교 사전 목록 링크
    [SQL, Pandas, R Prog, Dplyr, SQLDF, PANDASQL, DATA.TABLE]
    SQL EMP 예제로 만나는 테이블 데이터 처리 방법 리스트 링크
    반응형

    댓글