본문 바로가기
통계프로그램 비교 시리즈/데이터 전처리 비교

통계프로그램 비교 시리즈 - 13. 데이터 프레임 정렬과 중복제거-Sorting & duplicate

by 기서무나구물 2022. 1. 15.

포스팅 목차

    13. 데이터 프레임 정렬과 중복제거-Sorting & duplicate

     


    1. Proc SQL

    • Order by 구분을 이용하여 데이터 소트;
    SAS Programming
    proc sql;
      select *
      from   BACK.mydata
      order by workshop;
    quit;

     

    Results
    id  workshop  gender          q1        q2        q3        q4
    --------------------------------------------------------------
     3         1  f                2         2         4         3
     5         1  m                4         5         2         4
     7         1  m                5         3         4         4
     1         1  f                1         1         5         1
     6         2  m                5         4         5         5
     2         2  f                2         1         4         1
     4         2  f                3         1         .         3
     8         2  m                4         5         5         5

     

     

    SAS Programming
    proc sql;
      select *
      from   BACK.mydata
      order by workshop,gender;
    quit;

     

    Results
    id  workshop  gender          q1        q2        q3        q4
    --------------------------------------------------------------
     3         1  f                2         2         4         3
     1         1  f                1         1         5         1
     7         1  m                5         3         4         4
     5         1  m                4         5         2         4
     2         2  f                2         1         4         1
     4         2  f                3         1         .         3
     6         2  m                5         4         5         5
     8         2  m                4         5         5         5

     

     

    • Descending를 이용하여 내림차순 소트;
    SAS Programming
    proc sql;
      select *
      from   BACK.mydata
      order by workshop,gender descending;
    quit;

     

    Results
    id  workshop  gender          q1        q2        q3        q4
    --------------------------------------------------------------
     5         1  m                4         5         2         4
     7         1  m                5         3         4         4
     3         1  f                2         2         4         3
     1         1  f                1         1         5         1
     6         2  m                5         4         5         5
     8         2  m                4         5         5         5
     2         2  f                2         1         4         1
     4         2  f                3         1         .         3

     

     

    SAS Programming
    proc sql;
      select *
      from   BACK.mydata
      order by workshop descending,gender descending;
    quit;

     

    Results
    id  workshop  gender          q1        q2        q3        q4
    --------------------------------------------------------------
     6         2  m                5         4         5         5
     8         2  m                4         5         5         5
     2         2  f                2         1         4         1
     4         2  f                3         1         .         3
     7         1  m                5         3         4         4
     5         1  m                4         5         2         4
     1         1  f                1         1         5         1
     3         1  f                2         2         4         3

     

     

    • 위에서 소트 된 데이터 세트에서 Sub-query를 이용하여 중복된 데이터 제거
    SAS Programming
    proc sql;
      select a.*
      from   BACK.mydata a
      where  id = ( select max(id)
                    from   BACK.mydata b
                    where  a.workshop = b.workshop
                    and    a.gender   = b.gender);
    quit;

     

    Results
    id  workshop  gender          q1        q2        q3        q4
    --------------------------------------------------------------
     3         1  f                2         2         4         3
     4         2  f                3         1         .         3
     7         1  m                5         3         4         4
     8         2  m                4         5         5         5

     

     


    2. SAS Programming

    • 데이터 소트를 위한 SAS 프로그램.
    SAS Programming
    PROC SORT  DATA=BACK.mydata out=mydata;
         BY workshop;
    RUN;
    
    PROC PRINT DATA=mydata; RUN;

     

    Results
    OBS    id    workshop    gender    q1    q2    q3    q4
     1      1        1         f        1     1     5     1
     2      3        1         f        2     2     4     3
     3      5        1         m        4     5     2     4
     4      7        1         m        5     3     4     4
     5      2        2         f        2     1     4     1
     6      4        2         f        3     1     .     3
     7      6        2         m        5     4     5     5
     8      8        2         m        4     5     5     5

     

     

    SAS Programming
    PROC SORT  DATA=BACK.mydata out=mydata; 
         BY gender workshop;
    RUN;
    
    PROC PRINT DATA=mydata; RUN;

     

    Results
    OBS    id    workshop    gender    q1    q2    q3    q4
     1      1        1         f        1     1     5     1
     2      3        1         f        2     2     4     3
     3      2        2         f        2     1     4     1
     4      4        2         f        3     1     .     3
     5      5        1         m        4     5     2     4
     6      7        1         m        5     3     4     4
     7      6        2         m        5     4     5     5
     8      8        2         m        4     5     5     5

     

     

    SAS Programming
    PROC SORT  DATA=BACK.mydata out=mydata;
         BY workshop descending gender;
    RUN;
    
    PROC PRINT DATA=mydata; RUN;

     

    Results
    OBS    id    workshop    gender    q1    q2    q3    q4
     1      5        1         m        4     5     2     4
     2      7        1         m        5     3     4     4
     3      1        1         f        1     1     5     1
     4      3        1         f        2     2     4     3
     5      6        2         m        5     4     5     5
     6      8        2         m        4     5     5     5
     7      2        2         f        2     1     4     1
     8      4        2         f        3     1     .     3

     

     

    SAS Programming
    PROC SORT  DATA=BACK.mydata out=mydata;
         BY descending workshop descending gender;
    RUN;
    
    PROC PRINT DATA=mydata; RUN;

     

    Results
    OBS    id    workshop    gender    q1    q2    q3    q4
     1      6        2         m        5     4     5     5
     2      8        2         m        4     5     5     5
     3      2        2         f        2     1     4     1
     4      4        2         f        3     1     .     3
     5      5        1         m        4     5     2     4
     6      7        1         m        5     3     4     4
     7      1        1         f        1     1     5     1
     8      3        1         f        2     2     4     3

     

     

    SAS Programming
    PROC SORT  DATA=BACK.mydata out=mydata ;
         BY workshop gender descending id;
    RUN;
    
    
    PROC SORT  DATA=mydata nodupkey;
         BY workshop gender;
    RUN;
    
    
    PROC PRINT DATA=mydata; RUN;

     

    Results
    OBS    id    workshop    gender    q1    q2    q3    q4
     1      3        1         f        2     2     4     3
     2      7        1         m        5     3     4     4
     3      4        2         f        3     1     .     3
     4      8        2         m        4     5     5     5

     

     


    3. SPSS

    • SPSS Program to Sort Data.
    SPSS Programming
    SORT CASES BY workshop (A).
    LIST.
    EXECUTE.
    
    SORT CASES BY gender (A) workshop (A).
    LIST.
    EXECUTE.
    
    SORT CASES BY workshop (D) gender (A).
    LIST.
    EXECUTE.

     

     


    4. R Programming (R-PROJECT)

     

    R Programming
    from rpy2.robjects import r
    %load_ext rpy2.ipython

     

    Results
    The rpy2.ipython extension is already loaded. To reload it, use:
      %reload_ext rpy2.ipython

     

     

    R Programming
    %%R
    
    library(tidyverse)
    library(psych)
    
    library(reshape)
    library(reshape2)
    
    
    mydata <- read_csv("C:/work/data/mydata.csv", 
      col_types = cols( id       = col_double(),
                        workshop = col_character(),
                        gender   = col_character(),
                        q1       = col_double(),
                        q2       = col_double(),
                        q3       = col_double(),
                        q4       = col_double()
      )
    )
    
    withmooc = mydata
    
    attach(withmooc) # mydata를 기본 데이터 세트로 지정.
    
    withmooc

     

    Results
    R[write to console]: The following objects are masked from withmooc (pos = 3):
    
        gender, id, q1, q2, q3, q4, workshop
    
    
    
    
    # A tibble: 8 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     1 1        f          1     1     5     1
    2     2 2        f          2     1     4     1
    3     3 1        f          2     2     4     3
    4     4 2        f          3     1    NA     3
    5     5 1        m          4     5     2     4
    6     6 2        m          5     4     5     5
    7     7 1        m          5     3     4     4
    8     8 2        m          4     5     5     5

     

     

    • 인덱스를 이용한 데이터 프레임 소트
    • 처음 4개의 관측치를 출력.
    R Programming
    %%R
    
    withmooc[ c(1,2,3,4), ]

     

    Results
    # A tibble: 4 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     1 1        f          1     1     5     1
    2     2 2        f          2     1     4     1
    3     3 1        f          2     2     4     3
    4     4 2        f          3     1    NA     3

     

     

    • 인덱스 값을 거꾸로 삽입하여서 역순으로 출력한다.
    R Programming
    %%R
    
    withmooc[ c(4,3,2,1), ] 

     

    Results
    # A tibble: 4 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     4 2        f          3     1    NA     3
    2     3 1        f          2     2     4     3
    3     2 2        f          2     1     4     1
    4     1 1        f          1     1     5     1

     

     

    • 변수 workshop(1과2)를 기준으로 정렬.
    • Order함수는 정렬할 인덱스를 발견한다.
    R Programming
    %%R
    
    mydataSorted<-withmooc[ order(mydata$workshop), ]
    
    mydataSorted

     

    Results
    # A tibble: 8 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     1 1        f          1     1     5     1
    2     3 1        f          2     2     4     3
    3     5 1        m          4     5     2     4
    4     7 1        m          5     3     4     4
    5     2 2        f          2     1     4     1
    6     4 2        f          3     1    NA     3
    7     6 2        m          5     4     5     5
    8     8 2        m          4     5     5     5

     

     

    • 성별,workshop변수를 기준으로 정렬.
    R Programming
    %%R
    
    mydataSorted<-withmooc[ order( mydata$gender, mydata$workshop ), ]
    
    mydataSorted

     

    Results
    # A tibble: 8 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     1 1        f          1     1     5     1
    2     3 1        f          2     2     4     3
    3     2 2        f          2     1     4     1
    4     4 2        f          3     1    NA     3
    5     5 1        m          4     5     2     4
    6     7 1        m          5     3     4     4
    7     6 2        m          5     4     5     5
    8     8 2        m          4     5     5     5

     

     

    • 기본 소트 순서는 올림차순이다. 수치형 변수는 앞에 음수(-)를 넣어서 내림차순으로 정렬할 수 있다.
    R Programming
    %%R
    
    mydataSorted<- withmooc[ order( withmooc$gender, - withmooc$q1 ), ]
    
    mydataSorted

     

    Results
    # A tibble: 8 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     4 2        f          3     1    NA     3
    2     2 2        f          2     1     4     1
    3     3 1        f          2     2     4     3
    4     1 1        f          1     1     5     1
    5     6 2        m          5     4     5     5
    6     7 1        m          5     3     4     4
    7     5 1        m          4     5     2     4
    8     8 2        m          4     5     5     5

     

     

    • 음수 대신에 REV함수를 사용하여서 내림차순으로 정렬할수 있다.
    R Programming
    %%R
    
    mydataSorted<- mydata[ order( mydata$workshop,rev(mydata$gender) ), ]
    
    mydataSorted

     

    Results
    # A tibble: 8 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     5 1        m          4     5     2     4
    2     7 1        m          5     3     4     4
    3     1 1        f          1     1     5     1
    4     3 1        f          2     2     4     3
    5     6 2        m          5     4     5     5
    6     8 2        m          4     5     5     5
    7     2 2        f          2     1     4     1
    8     4 2        f          3     1    NA     3

     


    5. R - Tidyverse

     

    R Programming
    from rpy2.robjects import r
    %load_ext rpy2.ipython

     

    Results
    The rpy2.ipython extension is already loaded. To reload it, use:
      %reload_ext rpy2.ipython

     

     

    R Programming
    %%R
    
    library(tidyverse)
    library(psych)
    mydata <- read_csv("C:/work/data/mydata.csv", 
      col_types = cols( id       = col_double(),
                        workshop = col_character(),
                        gender   = col_character(),
                        q1       = col_double(),
                        q2       = col_double(),
                        q3       = col_double(),
                        q4       = col_double()
      )
    )
    
    withmooc = mydata
    
    attach(withmooc) # mydata를 기본 데이터 세트로 지정.
    
    withmooc

     

    Results
    R[write to console]: The following objects are masked from withmooc (pos = 4):
    
        gender, id, q1, q2, q3, q4, workshop
    
    
    R[write to console]: The following objects are masked from withmooc (pos = 5):
    
        gender, id, q1, q2, q3, q4, workshop
    
    
    
    
    # A tibble: 8 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     1 1        f          1     1     5     1
    2     2 2        f          2     1     4     1
    3     3 1        f          2     2     4     3
    4     4 2        f          3     1    NA     3
    5     5 1        m          4     5     2     4
    6     6 2        m          5     4     5     5
    7     7 1        m          5     3     4     4
    8     8 2        m          4     5     5     5

     

     

    • 처음 4개의 관측치를 출력.
    R Programming
    %%R
    
    withmooc %>%
      slice(1:4) %>%
      arrange(id)

     

    Results
    # A tibble: 4 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     1 1        f          1     1     5     1
    2     2 2        f          2     1     4     1
    3     3 1        f          2     2     4     3
    4     4 2        f          3     1    NA     3

     

    • 인덱스 값을 거꾸로 삽입하여서 역순으로 출력한다.
    R Programming
    %%R
    
    withmooc %>%
      slice(1:4) %>%
      arrange(desc(id))

     

    Results
    # A tibble: 4 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     4 2        f          3     1    NA     3
    2     3 1        f          2     2     4     3
    3     2 2        f          2     1     4     1
    4     1 1        f          1     1     5     1

     


    • 변수 workshop(1과2)를 기준으로 정렬.
    • arrange를 사용하여 정렬
    R Programming
    %%R
    
    withmooc %>%
      arrange(workshop)

     

    Results
    # A tibble: 8 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     1 1        f          1     1     5     1
    2     3 1        f          2     2     4     3
    3     5 1        m          4     5     2     4
    4     7 1        m          5     3     4     4
    5     2 2        f          2     1     4     1
    6     4 2        f          3     1    NA     3
    7     6 2        m          5     4     5     5
    8     8 2        m          4     5     5     5

     


    • 성별,workshop변수를 기준으로 정렬.
    R Programming
    %%R
    
    withmooc %>%
      arrange(gender, workshop)

     

    Results
    # A tibble: 8 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     1 1        f          1     1     5     1
    2     3 1        f          2     2     4     3
    3     2 2        f          2     1     4     1
    4     4 2        f          3     1    NA     3
    5     5 1        m          4     5     2     4
    6     7 1        m          5     3     4     4
    7     6 2        m          5     4     5     5
    8     8 2        m          4     5     5     5

     


    • 기본 소트 순서는 올림차순이다. 변수 앞에 음수(-)를 넣으면 내림차순으로 정렬한다.
    R Programming
    %%R
    
    withmooc %>%
      arrange(workshop, desc(gender))

     

    Results
    # A tibble: 8 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     5 1        m          4     5     2     4
    2     7 1        m          5     3     4     4
    3     1 1        f          1     1     5     1
    4     3 1        f          2     2     4     3
    5     6 2        m          5     4     5     5
    6     8 2        m          4     5     5     5
    7     2 2        f          2     1     4     1
    8     4 2        f          3     1    NA     3

     


     

    R Programming
    %%R
    
    withmooc %>%
      arrange(desc(workshop), desc(gender))

     

    Results
    # A tibble: 8 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     6 2        m          5     4     5     5
    2     8 2        m          4     5     5     5
    3     2 2        f          2     1     4     1
    4     4 2        f          3     1    NA     3
    5     5 1        m          4     5     2     4
    6     7 1        m          5     3     4     4
    7     1 1        f          1     1     5     1
    8     3 1        f          2     2     4     3

     


    • 데이터 중복 제거 하기
    R Programming
    %%R
    
    withmooc %>%
      arrange(workshop, gender,desc(id)) %>%
      distinct(workshop, gender, .keep_all = TRUE)

     

    Results
    # A tibble: 4 x 7
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     3 1        f          2     2     4     3
    2     7 1        m          5     3     4     4
    3     4 2        f          3     1    NA     3
    4     8 2        m          4     5     5     5

     


     

    R Programming
    %%R
    
    mydata %>%
      group_by(workshop, gender) %>%
      arrange(desc(id),.by_group=TRUE) %>%
      distinct(workshop, gender, .keep_all = TRUE)

     

    Results
    # A tibble: 4 x 7
    # Groups:   workshop, gender [4]
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     3 1        f          2     2     4     3
    2     7 1        m          5     3     4     4
    3     4 2        f          3     1    NA     3
    4     8 2        m          4     5     5     5

     


     

    R Programming
    %%R
    
    mydata %>%
      arrange(workshop, gender, desc(id)) %>%
      group_by(workshop, gender) %>%
      slice(1)

     

    Results
    # A tibble: 4 x 7
    # Groups:   workshop, gender [4]
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     3 1        f          2     2     4     3
    2     7 1        m          5     3     4     4
    3     4 2        f          3     1    NA     3
    4     8 2        m          4     5     5     5

     


    R Programming
    %%R
    
    mydata %>%
      group_by(workshop, gender) %>%
      arrange(desc(id),.by_group=TRUE) %>%
      slice(1)

     

    Results
    # A tibble: 4 x 7
    # Groups:   workshop, gender [4]
         id workshop gender    q1    q2    q3    q4
      <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
    1     3 1        f          2     2     4     3
    2     7 1        m          5     3     4     4
    3     4 2        f          3     1    NA     3
    4     8 2        m          4     5     5     5

     


    6. Python - Pandas

     

    Python Programming
    import pandas as pd
    import numpy as np
    import sweetviz as sv
    
    mydata = pd.read_csv("C:/work/data/mydata.csv",sep=",",
                         dtype={'id':object,'workshop':object,
                                'q1':int, 'q2':int, 'q3':float, 'q4':int},
                         na_values=['NaN'],skipinitialspace =True)
    
    withmooc= mydata.copy()
    
    withmooc

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1	1	5.0	1
    1	2	2		f	2	1	4.0	1
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	1	NaN	3
    4	5	1		m	4	5	2.0	4
    5	6	2		m	5	4	5.0	5
    6	7	1		m	5	3	4.0	4
    7	8	2		m	4	5	5.0	5

     


     

    Python Programming
     
    withmooc= mydata.copy()
    
    # 처음 4개의 관측치를 출력.
    withmooc.iloc[[0,1,2,3],]

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1	1	5.0	1
    1	2	2		f	2	1	4.0	1
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	1	NaN	3

     

     
    • 인덱스 값을 거꾸로 삽입하여서 역순으로 출력한다.
    Python Programming
    withmooc.iloc[[3,2,1,0],]

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    3	4	2		f	3	1	NaN	3
    2	3	1		f	2	2	4.0	3
    1	2	2		f	2	1	4.0	1
    0	1	1		f	1	1	5.0	1

     

     
    • 변수 workshop(1과2)를 기준으로 정렬.
    • sort_values를 사용하여 데이터 정렬
    Python Programming
    mydataSorted = withmooc.sort_values(by='workshop') 
    mydataSorted

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1	1	5.0	1
    2	3	1		f	2	2	4.0	3
    4	5	1		m	4	5	2.0	4
    6	7	1		m	5	3	4.0	4
    1	2	2		f	2	1	4.0	1
    3	4	2		f	3	1	NaN	3
    5	6	2		m	5	4	5.0	5
    7	8	2		m	4	5	5.0	5

     

     
    • 성별,workshop변수를 기준으로 정렬.
    Python Programming
    mydataSorted = withmooc.sort_values(by=['gender','workshop']) 
    mydataSorted

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1	1	5.0	1
    2	3	1		f	2	2	4.0	3
    1	2	2		f	2	1	4.0	1
    3	4	2		f	3	1	NaN	3
    4	5	1		m	4	5	2.0	4
    6	7	1		m	5	3	4.0	4
    5	6	2		m	5	4	5.0	5
    7	8	2		m	4	5	5.0	5

     

     
    • 기본 소트 순서는 올림차순이다. ascending인자를 사용하여 정렬 방향 결정
    Python Programming
    mydataSorted = withmooc.sort_values(['workshop','gender'],ascending=[False,True]) 
    mydataSorted

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    1	2	2		f	2	1	4.0	1
    3	4	2		f	3	1	NaN	3
    5	6	2		m	5	4	5.0	5
    7	8	2		m	4	5	5.0	5
    0	1	1		f	1	1	5.0	1
    2	3	1		f	2	2	4.0	3
    4	5	1		m	4	5	2.0	4
    6	7	1		m	5	3	4.0	4

     


    • 데이터 중복 제거 하기;
    Python Programming
    mydataSorted = withmooc.sort_values(['workshop','gender','id'],ascending=[True,True,False]) 
    mydataSorted

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    2	3	1		f	2	2	4.0	3
    0	1	1		f	1	1	5.0	1
    6	7	1		m	5	3	4.0	4
    4	5	1		m	4	5	2.0	4
    3	4	2		f	3	1	NaN	3
    1	2	2		f	2	1	4.0	1
    7	8	2		m	4	5	5.0	5
    5	6	2		m	5	4	5.0	5

     


    • 사전에 정렬된 데이터의 결과를 가지고 중복 데이터 제거;
    Python Programming
    mydataSorted.drop_duplicates(subset=['workshop','gender'])

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    2	3	1		f	2	2	4.0	3
    6	7	1		m	5	3	4.0	4
    3	4	2		f	3	1	NaN	3
    7	8	2		m	4	5	5.0	5

     


     

    Python Programming
     
    mydataSorted.drop_duplicates(subset=['workshop'],keep='last')

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    4	5	1		m	4	5	2.0	4
    5	6	2		m	5	4	5.0	5

     


    7. Python - dfply

     

    Python Programming
    import pandas as pd
    from dfply import *
    
    mydata   = pd.read_csv("c:/work/data/mydata.csv",sep=",",
                           dtype={'id':object,'workshop':object,
                                  'q1':int, 'q2':int, 'q3':float, 'q4':int},
                           na_values=['NaN'],skipinitialspace =True)
    
    withmooc= mydata.copy()
    
    # 모든 변수 선택하기.
    withmooc

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1	1	5.0	1
    1	2	2		f	2	1	4.0	1
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	1	NaN	3
    4	5	1		m	4	5	2.0	4
    5	6	2		m	5	4	5.0	5
    6	7	1		m	5	3	4.0	4
    7	8	2		m	4	5	5.0	5
     
    • 처음 4개의 관측치를 출력.
    Python Programming
    withmooc >> \
      row_slice(list(range(0,4))) >> \
      arrange(X.id)

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1	1	5.0	1
    1	2	2		f	2	1	4.0	1
    2	3	1		f	2	2	4.0	3
    3	4	2		f	3	1	NaN	3
     
    • 인덱스 값을 거꾸로 삽입하여서 역순으로 출력한다.
    Python Programming
    withmooc >> \
      row_slice(list(range(0,4))) >> \
      arrange(X.id, ascending=False)

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    3	4	2		f	3	1	NaN	3
    2	3	1		f	2	2	4.0	3
    1	2	2		f	2	1	4.0	1
    0	1	1		f	1	1	5.0	1
     
    • 변수 workshop(1과2)를 기준으로 정렬.
    • arrange함수를 사용하여 데이터 정렬
    Python Programming
    withmooc >> \
      arrange(X.workshop)

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1	1	5.0	1
    2	3	1		f	2	2	4.0	3
    4	5	1		m	4	5	2.0	4
    6	7	1		m	5	3	4.0	4
    1	2	2		f	2	1	4.0	1
    3	4	2		f	3	1	NaN	3
    5	6	2		m	5	4	5.0	5
    7	8	2		m	4	5	5.0	5
     
    • 성별,workshop변수를 기준으로 정렬.
    Python Programming
    withmooc >> \
      arrange(X.gender, X.workshop)

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1	1	5.0	1
    2	3	1		f	2	2	4.0	3
    1	2	2		f	2	1	4.0	1
    3	4	2		f	3	1	NaN	3
    4	5	1		m	4	5	2.0	4
    6	7	1		m	5	3	4.0	4
    5	6	2		m	5	4	5.0	5
    7	8	2		m	4	5	5.0	5
     
    • 기본 소트 순서는 올림차순이다. ascending인자를 사용하여 정렬 방향 결정
    Python Programming
    withmooc >> \
      arrange(X.workshop, X.gender, ascending=[True,False])

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    4	5	1		m	4	5	2.0	4
    6	7	1		m	5	3	4.0	4
    0	1	1		f	1	1	5.0	1
    2	3	1		f	2	2	4.0	3
    5	6	2		m	5	4	5.0	5
    7	8	2		m	4	5	5.0	5
    1	2	2		f	2	1	4.0	1
    3	4	2		f	3	1	NaN	3

     

    Python Programming
     
    mydata >> \
      arrange(X.workshop, X.gender, ascending=[False,False])

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    5	6	2		m	5	4	5.0	5
    7	8	2		m	4	5	5.0	5
    1	2	2		f	2	1	4.0	1
    3	4	2		f	3	1	NaN	3
    4	5	1		m	4	5	2.0	4
    6	7	1		m	5	3	4.0	4
    0	1	1		f	1	1	5.0	1
    2	3	1		f	2	2	4.0	3
     
    • 데이터 중복 제거 하기;
    Python Programming
    mydata >> \
      arrange(X.workshop, X.gender,X.id , ascending=[True,True,False]) >> \
      distinct(X.workshop, X.gender,keep='first')

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    2	3	1		f	2	2	4.0	3
    6	7	1		m	5	3	4.0	4
    3	4	2		f	3	1	NaN	3
    7	8	2		m	4	5	5.0	5

     

     

    Python Programming
     
    mydata >> \
      arrange(X.workshop, X.gender,X.id , ascending=[True,True,False]) >> \
      distinct(X.workshop, X.gender,keep='last')

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    0	1	1		f	1	1	5.0	1
    4	5	1		m	4	5	2.0	4
    1	2	2		f	2	1	4.0	1
    5	6	2		m	5	4	5.0	5

     

     

    Python Programming
     
    mydata >> \
      arrange(X.workshop, X.gender,X.id , ascending=[True,True,False]) >> \
      filter_by(~X[['workshop', 'gender']].duplicated( keep='first'))

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    2	3	1		f	2	2	4.0	3
    6	7	1		m	5	3	4.0	4
    3	4	2		f	3	1	NaN	3
    7	8	2		m	4	5	5.0	5

     

     

    Python Programming
     
    mydata >> \
      arrange(X.workshop, X.gender,X.id , ascending=[True,True,False]) >> \
      mask(~X[['workshop', 'gender']].duplicated())

     

    Results
    	id	workshop	gender	q1	q2	q3	q4
    2	3	1		f	2	2	4.0	3
    6	7	1		m	5	3	4.0	4
    3	4	2		f	3	1	NaN	3
    7	8	2		m	4	5	5.0	5
     

     


     

    통계프로그램 비교 목록(Proc sql, SAS, SPSS, R 프로그래밍, R Tidyverse, Python Pandas, Python Dfply)
    [Oracle, Pandas, R Prog, Dplyr, Sqldf, Pandasql, Data.Table] 오라클 함수와 R & Python 비교 사전 목록 링크
    [SQL, Pandas, R Prog, Dplyr, SQLDF, PANDASQL, DATA.TABLE]
    SQL EMP 예제로 만나는 테이블 데이터 처리 방법 리스트 링크
    반응형

    댓글