본문 바로가기
통계프로그램 비교 시리즈/데이터 전처리 비교

통계프로그램 비교 시리즈 – [변수와 관측치 선택] 3. 변수와 관측치를 동시에 선택

by 기서무나구물 2022. 1. 7.

포스팅 목차

    [변수와 관측치 선택]  3. 변수와 관측치를 동시에 선택

     


    1. Proc SQL

     

    • 성별이 남성인 관측치 중에서 선택된 변수로 데이터 구성.
    SAS Programming
    proc sql;
      create table mydata3 as
        select gender,q1,q2,q3,q4
        from   BACK.mydata
        where  gender='f';
    
      select * from mydata3;
    quit;
    
    proc sql;
      create table mydata3 as
        select gender,q1,q2,q3,q4
        from   BACK.mydata(keep=gender q:)
        where  gender='f';
    
      select * from mydata3;
    quit;
    
    proc sql;
      create table mydata3(keep=gender q:) as
        select gender,q1,q2,q3,q4
        from   BACK.mydata
        where  gender='f';
    
      select * from mydata3;
    quit;
    
    proc sql;
      create table mydata3(where=(gender='f')) as
        select gender,q1,q2,q3,q4
        from   BACK.mydata;
    
      select * from mydata3;
    quit;
    
    proc sql;
      create table mydata3 as
        select gender,q1,q2,q3,q4
        from   BACK.mydata(drop=id workshop)
        where  gender='f';
    
      select * from mydata3;
    quit;

     

    Results
    gender          q1        q2        q3        q4
    ------------------------------------------------
    f                1         1         5         1
    f                2         1         4         1
    f                2         2         4         3
    f                3         1         .         3

     


    2. SAS Programming

    • 성별이 남성인 관측치 중에서 선택된 변수로 데이터 구성.
    SAS Programming
    data mydata3;
     set BACK.mydata;
          where gender='f';
          KEEP gender q1--q4;
    run;
    
    proc print;run;
    
    data mydata3(where=(gender='f'));
     set BACK.mydata;
          KEEP gender q1--q4;
    run;
    
    proc print;run;
    
    data mydata3;
     set BACK.mydata;
          if gender='f' then output;
          KEEP gender q1--q4;
    run;
    
    proc print;run;
    
    data mydata3;
     set BACK.mydata;
          where gender='f';
          KEEP gender q:;
    run;
    
    proc print;run;
    
    data mydata3;
     set BACK.mydata(keep=gender q:);
          where gender='f';
    run;
    
    proc print;run;
    
    data mydata3;
     set BACK.mydata(drop=id workshop);
          where gender='f';
    run;
    
    proc print;run;

     

    Results
    OBS    gender    q1    q2    q3    q4
      1       f       1     1     5     1
      2       f       2     1     4     1
      3       f       2     2     4     3
      4       f       3     1     .     3

     

     


    3. SPSS

     

    SPSS Programming
    GET FILE=("c:\data\mydata.sav")
    
    COMPUTE X1=0.
    IF (q4 EQ 5 ) X1=1.
    
    COMPUTE X2=0.
    IF (q4 GE 4) X2=1.
    
    COMPUTE X3=0.
    IF (gender EQ 'f' AND Q4 GE 5) X3=1.
    
    COMPUTE scoreA=3*q1+q2.
    IF (gender='f') scoreA=2*q1+q2.
    
    COMPUTE scoreB=3*q1+q2.
    IF (workshop EQ 1 AND q4 GE 5) scoreB=2*q1+q2.
    
    EXECUTE.

     

     


    4. R Programming (R-PROJECT)

     

    R Programming
    from rpy2.robjects import r
    %load_ext rpy2.ipython

     

    Results
    The rpy2.ipython extension is already loaded. To reload it, use:
      %reload_ext rpy2.ipython

     

     

    R Programming
    %%R
    
    library(tidyverse)
    
    load(file="c://work/data/mydata.Rdata")
    
    withmooc = mydata
    
    attach(withmooc) # mydata를 기본 데이터 세트로 지정.
    
    withmooc

     

    Results
    R[write to console]: -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
    
    From cffi callback :
    Traceback (most recent call last):
      File "C:\Users\BACK\anaconda3\lib\site-packages\rpy2\rinterface_lib\callbacks.py", line 131, in _consolewrite_ex
        s = conversion._cchar_to_str_with_maxlen(buf, maxlen=n)
      File "C:\Users\BACK\anaconda3\lib\site-packages\rpy2\rinterface_lib\conversion.py", line 130, in _cchar_to_str_with_maxlen
        s = ffi.string(c, maxlen).decode('utf-8')
    UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 0: invalid start byte
    R[write to console]: -- Conflicts ------------------------------------------ tidyverse_conflicts() --
    x dplyr::filter() masks stats::filter()
    x dplyr::lag()    masks stats::lag()

     

    Results
      workshop gender q1 q2 q3 q4
    1        1      f  1  1  5  1
    2        2      f  2  1  4  1
    3        1      f  2  2  4  3
    4        2      f  3  1 NA  3
    5        1      m  4  5  2  4
    6        2      m  5  4  5  5
    7        1      m  5  3  4  4
    8        2      m  4  5  5  5

     

     

    R Programming
    %%R
    
    withmooc[1:4,2:6]

     

    Results
      gender q1 q2 q3 q4
    1      f  1  1  5  1
    2      f  2  1  4  1
    3      f  2  2  4  3
    4      f  3  1 NA  3

     

     

    • 성별이 남성인 관측치 중에서 선택된 변수로 데이터 구성.
    R Programming
    %%R
    
    print( mydata[gender=="f", c("gender","q1","q2","q3","q4")] )

     

    Results
      gender q1 q2 q3 q4
    1      f  1  1  5  1
    2      f  2  1  4  1
    3      f  2  2  4  3
    4      f  3  1 NA  3

     

     

    R Programming
    %%R
    
    subset(mydata,subset=gender=="f",select=gender:q4)

     

    Results
      gender q1 q2 q3 q4
    1      f  1  1  5  1
    2      f  2  1  4  1
    3      f  2  2  4  3
    4      f  3  1 NA  3

     

     


    5. R - Tidyverse

     

    R Programming
    %%R
    library(tidyverse)
    
    load(file="c://work/data/mydata.Rdata")
    
    withmooc = mydata
    
    attach(withmooc) # mydata를 기본 데이터 세트로 지정.
    
    withmooc

     

    Results
    R[write to console]: The following objects are masked from withmooc (pos = 3):
    
        gender, q1, q2, q3, q4, workshop
    

     

    Results
      workshop gender q1 q2 q3 q4
    1        1      f  1  1  5  1
    2        2      f  2  1  4  1
    3        1      f  2  2  4  3
    4        2      f  3  1 NA  3
    5        1      m  4  5  2  4
    6        2      m  5  4  5  5
    7        1      m  5  3  4  4
    8        2      m  4  5  5  5

     

     

    R Programming
    %%R
    
    withmooc %>%
      dplyr::slice(1:4) %>%
      dplyr::select(2:6)

     

    Results
      gender q1 q2 q3 q4
    1      f  1  1  5  1
    2      f  2  1  4  1
    3      f  2  2  4  3
    4      f  3  1 NA  3

     

     

    R Programming
    %%R
    
    myMales <- withmooc %>% 
        dplyr::filter(gender == "f") %>% 
        dplyr::select(2:6)
    
    print(myMales)

     

    Results
      gender q1 q2 q3 q4
    1      f  1  1  5  1
    2      f  2  1  4  1
    3      f  2  2  4  3
    4      f  3  1 NA  3

     

     

    R Programming
    %%R
    
    withmooc %>%
      dplyr::filter(gender=="f") %>%
      dplyr::select("gender","q1","q2","q3","q4")

     

    Results
      gender q1 q2 q3 q4
    1      f  1  1  5  1
    2      f  2  1  4  1
    3      f  2  2  4  3
    4      f  3  1 NA  3

     

     

    R Programming
    %%R
    
    withmooc[gender=="f", c("gender","q1","q2","q3","q4")]

     

    Results
      gender q1 q2 q3 q4
    1      f  1  1  5  1
    2      f  2  1  4  1
    3      f  2  2  4  3
    4      f  3  1 NA  3

     

     


    6. Python - Pandas

     

    Python Programming
    import pandas as pd
    import numpy as np
    import sweetviz as sv
    
    mydata   = pd.read_csv("C:/work/data/mydata.csv",sep=",",
                           dtype={'id':object,'workshop':object,
                                  'q1':int, 'q2':int, 'q3':float, 'q4':int},
                           na_values=['NaN'],skipinitialspace =True)
    
    mydata =mydata.drop('id',axis=1)
    
    withmooc= mydata.copy()
    
    withmooc

     

    Results
    	workshop	gender	q1	q2	q3	q4
    0	1		f	1	1	5.0	1
    1	2		f	2	1	4.0	1
    2	1		f	2	2	4.0	3
    3	2		f	3	1	NaN	3
    4	1		m	4	5	2.0	4
    5	2		m	5	4	5.0	5
    6	1		m	5	3	4.0	4
    7	2		m	4	5	5.0	5

     

    • 성별이 남성인 관측치 중에서 선택된 변수로 데이터 구성.
    Python Programming
    withmooc.iloc[0:4,1:6]
    
    # 동일한 결과를 산출하는 프로그램.
    withmooc.loc[mydata.gender=="f", ["gender","q1","q2","q3","q4"]]

     

    Results
    	gender	q1	q2	q3	q4
    0		f	1	1	5.0	1
    1		f	2	1	4.0	1
    2		f	2	2	4.0	3
    3		f	3	1	NaN	3

     


    7. Python - dfply

     

    Python Programming
    import pandas as pd
    from dfply import *
    
    mydata   = pd.read_csv("C:/work/data/mydata.csv",sep=",",
                           dtype={'id':object,'workshop':object,
                                  'q1':int, 'q2':int, 'q3':float, 'q4':int},
                           na_values=['NaN'],skipinitialspace =True)
    
    mydata =mydata.drop('id',axis=1)
    
    withmooc= mydata.copy()
    
    withmooc

     

    Results
    	workshop	gender	q1	q2	q3	q4
    0	1		f	1	1	5.0	1
    1	2		f	2	1	4.0	1
    2	1		f	2	2	4.0	3
    3	2		f	3	1	NaN	3
    4	1		m	4	5	2.0	4
    5	2		m	5	4	5.0	5
    6	1		m	5	3	4.0	4
    7	2		m	4	5	5.0	5

     

     

    Python Programming

     

    withmooc >> row_slice([0,1,2,3]) >> select(X.gender, X.q1, X.q2, X.q3,X.q4)

     

    Results
    	gender	q1	q2	q3	q4
    0		f	1	1	5.0	1
    1		f	2	1	4.0	1
    2		f	2	2	4.0	3
    3		f	3	1	NaN	3

     

     

    Python Programming
    withmooc >> row_slice((X.gender=="f")) >> select(1,2,3,4,5)

     

    Results
    	gender	q1	q2	q3	q4
    0	f	1	1	5.0	1
    1	f	2	1	4.0	1
    2	f	2	2	4.0	3
    3	f	3	1	NaN	3

     

     


     

    통계프로그램 비교 목록(Proc sql, SAS, SPSS, R 프로그래밍, R Tidyverse, Python Pandas, Python Dfply)
    [Oracle, Pandas, R Prog, Dplyr, Sqldf, Pandasql, Data.Table] 오라클 함수와 R & Python 비교 사전 목록 링크
    [SQL, Pandas, R Prog, Dplyr, SQLDF, PANDASQL, DATA.TABLE]
    SQL EMP 예제로 만나는 테이블 데이터 처리 방법 리스트 링크
    반응형

    댓글