본문 바로가기
통계프로그램 비교 시리즈/데이터 전처리 비교

통계프로그램 비교 시리즈 - 13. 데이터 프레임 정렬과 중복제거-Sorting & duplicate

by 기서무나구물 2022. 1. 15.

13. 데이터 프레임 정렬과 중복제거-Sorting & duplicate

 


1. Proc SQL

  • Order by 구분을 이용하여 데이터 소트;
SAS Programming
proc sql;
  select *
  from   BACK.mydata
  order by workshop;
quit;

 

Results
id  workshop  gender          q1        q2        q3        q4
--------------------------------------------------------------
 3         1  f                2         2         4         3
 5         1  m                4         5         2         4
 7         1  m                5         3         4         4
 1         1  f                1         1         5         1
 6         2  m                5         4         5         5
 2         2  f                2         1         4         1
 4         2  f                3         1         .         3
 8         2  m                4         5         5         5

 

 

SAS Programming
proc sql;
  select *
  from   BACK.mydata
  order by workshop,gender;
quit;

 

Results
id  workshop  gender          q1        q2        q3        q4
--------------------------------------------------------------
 3         1  f                2         2         4         3
 1         1  f                1         1         5         1
 7         1  m                5         3         4         4
 5         1  m                4         5         2         4
 2         2  f                2         1         4         1
 4         2  f                3         1         .         3
 6         2  m                5         4         5         5
 8         2  m                4         5         5         5

 

 

  • Descending를 이용하여 내림차순 소트;
SAS Programming
proc sql;
  select *
  from   BACK.mydata
  order by workshop,gender descending;
quit;

 

Results
id  workshop  gender          q1        q2        q3        q4
--------------------------------------------------------------
 5         1  m                4         5         2         4
 7         1  m                5         3         4         4
 3         1  f                2         2         4         3
 1         1  f                1         1         5         1
 6         2  m                5         4         5         5
 8         2  m                4         5         5         5
 2         2  f                2         1         4         1
 4         2  f                3         1         .         3

 

 

SAS Programming
proc sql;
  select *
  from   BACK.mydata
  order by workshop descending,gender descending;
quit;

 

Results
id  workshop  gender          q1        q2        q3        q4
--------------------------------------------------------------
 6         2  m                5         4         5         5
 8         2  m                4         5         5         5
 2         2  f                2         1         4         1
 4         2  f                3         1         .         3
 7         1  m                5         3         4         4
 5         1  m                4         5         2         4
 1         1  f                1         1         5         1
 3         1  f                2         2         4         3

 

 

  • 위에서 소트 된 데이터 세트에서 Sub-query를 이용하여 중복된 데이터 제거
SAS Programming
proc sql;
  select a.*
  from   BACK.mydata a
  where  id = ( select max(id)
                from   BACK.mydata b
                where  a.workshop = b.workshop
                and    a.gender   = b.gender);
quit;

 

Results
id  workshop  gender          q1        q2        q3        q4
--------------------------------------------------------------
 3         1  f                2         2         4         3
 4         2  f                3         1         .         3
 7         1  m                5         3         4         4
 8         2  m                4         5         5         5

 

 


2. SAS Programming

  • 데이터 소트를 위한 SAS 프로그램.
SAS Programming
PROC SORT  DATA=BACK.mydata out=mydata;
     BY workshop;
RUN;

PROC PRINT DATA=mydata; RUN;

 

Results
OBS    id    workshop    gender    q1    q2    q3    q4
 1      1        1         f        1     1     5     1
 2      3        1         f        2     2     4     3
 3      5        1         m        4     5     2     4
 4      7        1         m        5     3     4     4
 5      2        2         f        2     1     4     1
 6      4        2         f        3     1     .     3
 7      6        2         m        5     4     5     5
 8      8        2         m        4     5     5     5

 

 

SAS Programming
PROC SORT  DATA=BACK.mydata out=mydata; 
     BY gender workshop;
RUN;

PROC PRINT DATA=mydata; RUN;

 

Results
OBS    id    workshop    gender    q1    q2    q3    q4
 1      1        1         f        1     1     5     1
 2      3        1         f        2     2     4     3
 3      2        2         f        2     1     4     1
 4      4        2         f        3     1     .     3
 5      5        1         m        4     5     2     4
 6      7        1         m        5     3     4     4
 7      6        2         m        5     4     5     5
 8      8        2         m        4     5     5     5

 

 

SAS Programming
PROC SORT  DATA=BACK.mydata out=mydata;
     BY workshop descending gender;
RUN;

PROC PRINT DATA=mydata; RUN;

 

Results
OBS    id    workshop    gender    q1    q2    q3    q4
 1      5        1         m        4     5     2     4
 2      7        1         m        5     3     4     4
 3      1        1         f        1     1     5     1
 4      3        1         f        2     2     4     3
 5      6        2         m        5     4     5     5
 6      8        2         m        4     5     5     5
 7      2        2         f        2     1     4     1
 8      4        2         f        3     1     .     3

 

 

SAS Programming
PROC SORT  DATA=BACK.mydata out=mydata;
     BY descending workshop descending gender;
RUN;

PROC PRINT DATA=mydata; RUN;

 

Results
OBS    id    workshop    gender    q1    q2    q3    q4
 1      6        2         m        5     4     5     5
 2      8        2         m        4     5     5     5
 3      2        2         f        2     1     4     1
 4      4        2         f        3     1     .     3
 5      5        1         m        4     5     2     4
 6      7        1         m        5     3     4     4
 7      1        1         f        1     1     5     1
 8      3        1         f        2     2     4     3

 

 

SAS Programming
PROC SORT  DATA=BACK.mydata out=mydata ;
     BY workshop gender descending id;
RUN;


PROC SORT  DATA=mydata nodupkey;
     BY workshop gender;
RUN;


PROC PRINT DATA=mydata; RUN;

 

Results
OBS    id    workshop    gender    q1    q2    q3    q4
 1      3        1         f        2     2     4     3
 2      7        1         m        5     3     4     4
 3      4        2         f        3     1     .     3
 4      8        2         m        4     5     5     5

 

 


3. SPSS

  • SPSS Program to Sort Data.
SPSS Programming
SORT CASES BY workshop (A).
LIST.
EXECUTE.

SORT CASES BY gender (A) workshop (A).
LIST.
EXECUTE.

SORT CASES BY workshop (D) gender (A).
LIST.
EXECUTE.

 

 


4. R Programming (R-PROJECT)

 

R Programming
from rpy2.robjects import r
%load_ext rpy2.ipython

 

Results
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython

 

 

R Programming
%%R

library(tidyverse)
library(psych)

library(reshape)
library(reshape2)


mydata <- read_csv("C:/work/data/mydata.csv", 
  col_types = cols( id       = col_double(),
                    workshop = col_character(),
                    gender   = col_character(),
                    q1       = col_double(),
                    q2       = col_double(),
                    q3       = col_double(),
                    q4       = col_double()
  )
)

withmooc = mydata

attach(withmooc) # mydata를 기본 데이터 세트로 지정.

withmooc

 

Results
R[write to console]: The following objects are masked from withmooc (pos = 3):

    gender, id, q1, q2, q3, q4, workshop




# A tibble: 8 x 7
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     1 1        f          1     1     5     1
2     2 2        f          2     1     4     1
3     3 1        f          2     2     4     3
4     4 2        f          3     1    NA     3
5     5 1        m          4     5     2     4
6     6 2        m          5     4     5     5
7     7 1        m          5     3     4     4
8     8 2        m          4     5     5     5

 

 

  • 인덱스를 이용한 데이터 프레임 소트
  • 처음 4개의 관측치를 출력.
R Programming
%%R

withmooc[ c(1,2,3,4), ]

 

Results
# A tibble: 4 x 7
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     1 1        f          1     1     5     1
2     2 2        f          2     1     4     1
3     3 1        f          2     2     4     3
4     4 2        f          3     1    NA     3

 

 

  • 인덱스 값을 거꾸로 삽입하여서 역순으로 출력한다.
R Programming
%%R

withmooc[ c(4,3,2,1), ] 

 

Results
# A tibble: 4 x 7
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     4 2        f          3     1    NA     3
2     3 1        f          2     2     4     3
3     2 2        f          2     1     4     1
4     1 1        f          1     1     5     1

 

 

  • 변수 workshop(1과2)를 기준으로 정렬.
  • Order함수는 정렬할 인덱스를 발견한다.
R Programming
%%R

mydataSorted<-withmooc[ order(mydata$workshop), ]

mydataSorted

 

Results
# A tibble: 8 x 7
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     1 1        f          1     1     5     1
2     3 1        f          2     2     4     3
3     5 1        m          4     5     2     4
4     7 1        m          5     3     4     4
5     2 2        f          2     1     4     1
6     4 2        f          3     1    NA     3
7     6 2        m          5     4     5     5
8     8 2        m          4     5     5     5

 

 

  • 성별,workshop변수를 기준으로 정렬.
R Programming
%%R

mydataSorted<-withmooc[ order( mydata$gender, mydata$workshop ), ]

mydataSorted

 

Results
# A tibble: 8 x 7
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     1 1        f          1     1     5     1
2     3 1        f          2     2     4     3
3     2 2        f          2     1     4     1
4     4 2        f          3     1    NA     3
5     5 1        m          4     5     2     4
6     7 1        m          5     3     4     4
7     6 2        m          5     4     5     5
8     8 2        m          4     5     5     5

 

 

  • 기본 소트 순서는 올림차순이다. 수치형 변수는 앞에 음수(-)를 넣어서 내림차순으로 정렬할 수 있다.
R Programming
%%R

mydataSorted<- withmooc[ order( withmooc$gender, - withmooc$q1 ), ]

mydataSorted

 

Results
# A tibble: 8 x 7
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     4 2        f          3     1    NA     3
2     2 2        f          2     1     4     1
3     3 1        f          2     2     4     3
4     1 1        f          1     1     5     1
5     6 2        m          5     4     5     5
6     7 1        m          5     3     4     4
7     5 1        m          4     5     2     4
8     8 2        m          4     5     5     5

 

 

  • 음수 대신에 REV함수를 사용하여서 내림차순으로 정렬할수 있다.
R Programming
%%R

mydataSorted<- mydata[ order( mydata$workshop,rev(mydata$gender) ), ]

mydataSorted

 

Results
# A tibble: 8 x 7
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     5 1        m          4     5     2     4
2     7 1        m          5     3     4     4
3     1 1        f          1     1     5     1
4     3 1        f          2     2     4     3
5     6 2        m          5     4     5     5
6     8 2        m          4     5     5     5
7     2 2        f          2     1     4     1
8     4 2        f          3     1    NA     3

 


5. R - Tidyverse

 

R Programming
from rpy2.robjects import r
%load_ext rpy2.ipython

 

Results
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython

 

 

R Programming
%%R

library(tidyverse)
library(psych)
mydata <- read_csv("C:/work/data/mydata.csv", 
  col_types = cols( id       = col_double(),
                    workshop = col_character(),
                    gender   = col_character(),
                    q1       = col_double(),
                    q2       = col_double(),
                    q3       = col_double(),
                    q4       = col_double()
  )
)

withmooc = mydata

attach(withmooc) # mydata를 기본 데이터 세트로 지정.

withmooc

 

Results
R[write to console]: The following objects are masked from withmooc (pos = 4):

    gender, id, q1, q2, q3, q4, workshop


R[write to console]: The following objects are masked from withmooc (pos = 5):

    gender, id, q1, q2, q3, q4, workshop




# A tibble: 8 x 7
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     1 1        f          1     1     5     1
2     2 2        f          2     1     4     1
3     3 1        f          2     2     4     3
4     4 2        f          3     1    NA     3
5     5 1        m          4     5     2     4
6     6 2        m          5     4     5     5
7     7 1        m          5     3     4     4
8     8 2        m          4     5     5     5

 

 

  • 처음 4개의 관측치를 출력.
R Programming
%%R

withmooc %>%
  slice(1:4) %>%
  arrange(id)

 

Results
# A tibble: 4 x 7
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     1 1        f          1     1     5     1
2     2 2        f          2     1     4     1
3     3 1        f          2     2     4     3
4     4 2        f          3     1    NA     3

 

  • 인덱스 값을 거꾸로 삽입하여서 역순으로 출력한다.
R Programming
%%R

withmooc %>%
  slice(1:4) %>%
  arrange(desc(id))

 

Results
# A tibble: 4 x 7
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     4 2        f          3     1    NA     3
2     3 1        f          2     2     4     3
3     2 2        f          2     1     4     1
4     1 1        f          1     1     5     1

 


  • 변수 workshop(1과2)를 기준으로 정렬.
  • arrange를 사용하여 정렬
R Programming
%%R

withmooc %>%
  arrange(workshop)

 

Results
# A tibble: 8 x 7
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     1 1        f          1     1     5     1
2     3 1        f          2     2     4     3
3     5 1        m          4     5     2     4
4     7 1        m          5     3     4     4
5     2 2        f          2     1     4     1
6     4 2        f          3     1    NA     3
7     6 2        m          5     4     5     5
8     8 2        m          4     5     5     5

 


  • 성별,workshop변수를 기준으로 정렬.
R Programming
%%R

withmooc %>%
  arrange(gender, workshop)

 

Results
# A tibble: 8 x 7
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     1 1        f          1     1     5     1
2     3 1        f          2     2     4     3
3     2 2        f          2     1     4     1
4     4 2        f          3     1    NA     3
5     5 1        m          4     5     2     4
6     7 1        m          5     3     4     4
7     6 2        m          5     4     5     5
8     8 2        m          4     5     5     5

 


  • 기본 소트 순서는 올림차순이다. 변수 앞에 음수(-)를 넣으면 내림차순으로 정렬한다.
R Programming
%%R

withmooc %>%
  arrange(workshop, desc(gender))

 

Results
# A tibble: 8 x 7
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     5 1        m          4     5     2     4
2     7 1        m          5     3     4     4
3     1 1        f          1     1     5     1
4     3 1        f          2     2     4     3
5     6 2        m          5     4     5     5
6     8 2        m          4     5     5     5
7     2 2        f          2     1     4     1
8     4 2        f          3     1    NA     3

 


 

R Programming
%%R

withmooc %>%
  arrange(desc(workshop), desc(gender))

 

Results
# A tibble: 8 x 7
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     6 2        m          5     4     5     5
2     8 2        m          4     5     5     5
3     2 2        f          2     1     4     1
4     4 2        f          3     1    NA     3
5     5 1        m          4     5     2     4
6     7 1        m          5     3     4     4
7     1 1        f          1     1     5     1
8     3 1        f          2     2     4     3

 


  • 데이터 중복 제거 하기
R Programming
%%R

withmooc %>%
  arrange(workshop, gender,desc(id)) %>%
  distinct(workshop, gender, .keep_all = TRUE)

 

Results
# A tibble: 4 x 7
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     3 1        f          2     2     4     3
2     7 1        m          5     3     4     4
3     4 2        f          3     1    NA     3
4     8 2        m          4     5     5     5

 


 

R Programming
%%R

mydata %>%
  group_by(workshop, gender) %>%
  arrange(desc(id),.by_group=TRUE) %>%
  distinct(workshop, gender, .keep_all = TRUE)

 

Results
# A tibble: 4 x 7
# Groups:   workshop, gender [4]
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     3 1        f          2     2     4     3
2     7 1        m          5     3     4     4
3     4 2        f          3     1    NA     3
4     8 2        m          4     5     5     5

 


 

R Programming
%%R

mydata %>%
  arrange(workshop, gender, desc(id)) %>%
  group_by(workshop, gender) %>%
  slice(1)

 

Results
# A tibble: 4 x 7
# Groups:   workshop, gender [4]
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     3 1        f          2     2     4     3
2     7 1        m          5     3     4     4
3     4 2        f          3     1    NA     3
4     8 2        m          4     5     5     5

 


R Programming
%%R

mydata %>%
  group_by(workshop, gender) %>%
  arrange(desc(id),.by_group=TRUE) %>%
  slice(1)

 

Results
# A tibble: 4 x 7
# Groups:   workshop, gender [4]
     id workshop gender    q1    q2    q3    q4
  <dbl> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl>
1     3 1        f          2     2     4     3
2     7 1        m          5     3     4     4
3     4 2        f          3     1    NA     3
4     8 2        m          4     5     5     5

 


6. Python - Pandas

 

Python Programming
import pandas as pd
import numpy as np
import sweetviz as sv

mydata = pd.read_csv("C:/work/data/mydata.csv",sep=",",
                     dtype={'id':object,'workshop':object,
                            'q1':int, 'q2':int, 'q3':float, 'q4':int},
                     na_values=['NaN'],skipinitialspace =True)

withmooc= mydata.copy()

withmooc

 

Results
	id	workshop	gender	q1	q2	q3	q4
0	1	1		f	1	1	5.0	1
1	2	2		f	2	1	4.0	1
2	3	1		f	2	2	4.0	3
3	4	2		f	3	1	NaN	3
4	5	1		m	4	5	2.0	4
5	6	2		m	5	4	5.0	5
6	7	1		m	5	3	4.0	4
7	8	2		m	4	5	5.0	5

 


 

Python Programming
 
withmooc= mydata.copy()

# 처음 4개의 관측치를 출력.
withmooc.iloc[[0,1,2,3],]

 

Results
	id	workshop	gender	q1	q2	q3	q4
0	1	1		f	1	1	5.0	1
1	2	2		f	2	1	4.0	1
2	3	1		f	2	2	4.0	3
3	4	2		f	3	1	NaN	3

 

 
  • 인덱스 값을 거꾸로 삽입하여서 역순으로 출력한다.
Python Programming
withmooc.iloc[[3,2,1,0],]

 

Results
	id	workshop	gender	q1	q2	q3	q4
3	4	2		f	3	1	NaN	3
2	3	1		f	2	2	4.0	3
1	2	2		f	2	1	4.0	1
0	1	1		f	1	1	5.0	1

 

 
  • 변수 workshop(1과2)를 기준으로 정렬.
  • sort_values를 사용하여 데이터 정렬
Python Programming
mydataSorted = withmooc.sort_values(by='workshop') 
mydataSorted

 

Results
	id	workshop	gender	q1	q2	q3	q4
0	1	1		f	1	1	5.0	1
2	3	1		f	2	2	4.0	3
4	5	1		m	4	5	2.0	4
6	7	1		m	5	3	4.0	4
1	2	2		f	2	1	4.0	1
3	4	2		f	3	1	NaN	3
5	6	2		m	5	4	5.0	5
7	8	2		m	4	5	5.0	5

 

 
  • 성별,workshop변수를 기준으로 정렬.
Python Programming
mydataSorted = withmooc.sort_values(by=['gender','workshop']) 
mydataSorted

 

Results
	id	workshop	gender	q1	q2	q3	q4
0	1	1		f	1	1	5.0	1
2	3	1		f	2	2	4.0	3
1	2	2		f	2	1	4.0	1
3	4	2		f	3	1	NaN	3
4	5	1		m	4	5	2.0	4
6	7	1		m	5	3	4.0	4
5	6	2		m	5	4	5.0	5
7	8	2		m	4	5	5.0	5

 

 
  • 기본 소트 순서는 올림차순이다. ascending인자를 사용하여 정렬 방향 결정
Python Programming
mydataSorted = withmooc.sort_values(['workshop','gender'],ascending=[False,True]) 
mydataSorted

 

Results
	id	workshop	gender	q1	q2	q3	q4
1	2	2		f	2	1	4.0	1
3	4	2		f	3	1	NaN	3
5	6	2		m	5	4	5.0	5
7	8	2		m	4	5	5.0	5
0	1	1		f	1	1	5.0	1
2	3	1		f	2	2	4.0	3
4	5	1		m	4	5	2.0	4
6	7	1		m	5	3	4.0	4

 


  • 데이터 중복 제거 하기;
Python Programming
mydataSorted = withmooc.sort_values(['workshop','gender','id'],ascending=[True,True,False]) 
mydataSorted

 

Results
	id	workshop	gender	q1	q2	q3	q4
2	3	1		f	2	2	4.0	3
0	1	1		f	1	1	5.0	1
6	7	1		m	5	3	4.0	4
4	5	1		m	4	5	2.0	4
3	4	2		f	3	1	NaN	3
1	2	2		f	2	1	4.0	1
7	8	2		m	4	5	5.0	5
5	6	2		m	5	4	5.0	5

 


  • 사전에 정렬된 데이터의 결과를 가지고 중복 데이터 제거;
Python Programming
mydataSorted.drop_duplicates(subset=['workshop','gender'])

 

Results
	id	workshop	gender	q1	q2	q3	q4
2	3	1		f	2	2	4.0	3
6	7	1		m	5	3	4.0	4
3	4	2		f	3	1	NaN	3
7	8	2		m	4	5	5.0	5

 


 

Python Programming
 
mydataSorted.drop_duplicates(subset=['workshop'],keep='last')

 

Results
	id	workshop	gender	q1	q2	q3	q4
4	5	1		m	4	5	2.0	4
5	6	2		m	5	4	5.0	5

 


7. Python - dfply

 

Python Programming
import pandas as pd
from dfply import *

mydata   = pd.read_csv("c:/work/data/mydata.csv",sep=",",
                       dtype={'id':object,'workshop':object,
                              'q1':int, 'q2':int, 'q3':float, 'q4':int},
                       na_values=['NaN'],skipinitialspace =True)

withmooc= mydata.copy()

# 모든 변수 선택하기.
withmooc

 

Results
	id	workshop	gender	q1	q2	q3	q4
0	1	1		f	1	1	5.0	1
1	2	2		f	2	1	4.0	1
2	3	1		f	2	2	4.0	3
3	4	2		f	3	1	NaN	3
4	5	1		m	4	5	2.0	4
5	6	2		m	5	4	5.0	5
6	7	1		m	5	3	4.0	4
7	8	2		m	4	5	5.0	5
 
  • 처음 4개의 관측치를 출력.
Python Programming
withmooc >> \
  row_slice(list(range(0,4))) >> \
  arrange(X.id)

 

Results
	id	workshop	gender	q1	q2	q3	q4
0	1	1		f	1	1	5.0	1
1	2	2		f	2	1	4.0	1
2	3	1		f	2	2	4.0	3
3	4	2		f	3	1	NaN	3
 
  • 인덱스 값을 거꾸로 삽입하여서 역순으로 출력한다.
Python Programming
withmooc >> \
  row_slice(list(range(0,4))) >> \
  arrange(X.id, ascending=False)

 

Results
	id	workshop	gender	q1	q2	q3	q4
3	4	2		f	3	1	NaN	3
2	3	1		f	2	2	4.0	3
1	2	2		f	2	1	4.0	1
0	1	1		f	1	1	5.0	1
 
  • 변수 workshop(1과2)를 기준으로 정렬.
  • arrange함수를 사용하여 데이터 정렬
Python Programming
withmooc >> \
  arrange(X.workshop)

 

Results
	id	workshop	gender	q1	q2	q3	q4
0	1	1		f	1	1	5.0	1
2	3	1		f	2	2	4.0	3
4	5	1		m	4	5	2.0	4
6	7	1		m	5	3	4.0	4
1	2	2		f	2	1	4.0	1
3	4	2		f	3	1	NaN	3
5	6	2		m	5	4	5.0	5
7	8	2		m	4	5	5.0	5
 
  • 성별,workshop변수를 기준으로 정렬.
Python Programming
withmooc >> \
  arrange(X.gender, X.workshop)

 

Results
	id	workshop	gender	q1	q2	q3	q4
0	1	1		f	1	1	5.0	1
2	3	1		f	2	2	4.0	3
1	2	2		f	2	1	4.0	1
3	4	2		f	3	1	NaN	3
4	5	1		m	4	5	2.0	4
6	7	1		m	5	3	4.0	4
5	6	2		m	5	4	5.0	5
7	8	2		m	4	5	5.0	5
 
  • 기본 소트 순서는 올림차순이다. ascending인자를 사용하여 정렬 방향 결정
Python Programming
withmooc >> \
  arrange(X.workshop, X.gender, ascending=[True,False])

 

Results
	id	workshop	gender	q1	q2	q3	q4
4	5	1		m	4	5	2.0	4
6	7	1		m	5	3	4.0	4
0	1	1		f	1	1	5.0	1
2	3	1		f	2	2	4.0	3
5	6	2		m	5	4	5.0	5
7	8	2		m	4	5	5.0	5
1	2	2		f	2	1	4.0	1
3	4	2		f	3	1	NaN	3

 

Python Programming
 
mydata >> \
  arrange(X.workshop, X.gender, ascending=[False,False])

 

Results
	id	workshop	gender	q1	q2	q3	q4
5	6	2		m	5	4	5.0	5
7	8	2		m	4	5	5.0	5
1	2	2		f	2	1	4.0	1
3	4	2		f	3	1	NaN	3
4	5	1		m	4	5	2.0	4
6	7	1		m	5	3	4.0	4
0	1	1		f	1	1	5.0	1
2	3	1		f	2	2	4.0	3
 
  • 데이터 중복 제거 하기;
Python Programming
mydata >> \
  arrange(X.workshop, X.gender,X.id , ascending=[True,True,False]) >> \
  distinct(X.workshop, X.gender,keep='first')

 

Results
	id	workshop	gender	q1	q2	q3	q4
2	3	1		f	2	2	4.0	3
6	7	1		m	5	3	4.0	4
3	4	2		f	3	1	NaN	3
7	8	2		m	4	5	5.0	5

 

 

Python Programming
 
mydata >> \
  arrange(X.workshop, X.gender,X.id , ascending=[True,True,False]) >> \
  distinct(X.workshop, X.gender,keep='last')

 

Results
	id	workshop	gender	q1	q2	q3	q4
0	1	1		f	1	1	5.0	1
4	5	1		m	4	5	2.0	4
1	2	2		f	2	1	4.0	1
5	6	2		m	5	4	5.0	5

 

 

Python Programming
 
mydata >> \
  arrange(X.workshop, X.gender,X.id , ascending=[True,True,False]) >> \
  filter_by(~X[['workshop', 'gender']].duplicated( keep='first'))

 

Results
	id	workshop	gender	q1	q2	q3	q4
2	3	1		f	2	2	4.0	3
6	7	1		m	5	3	4.0	4
3	4	2		f	3	1	NaN	3
7	8	2		m	4	5	5.0	5

 

 

Python Programming
 
mydata >> \
  arrange(X.workshop, X.gender,X.id , ascending=[True,True,False]) >> \
  mask(~X[['workshop', 'gender']].duplicated())

 

Results
	id	workshop	gender	q1	q2	q3	q4
2	3	1		f	2	2	4.0	3
6	7	1		m	5	3	4.0	4
3	4	2		f	3	1	NaN	3
7	8	2		m	4	5	5.0	5
 

 


 

통계프로그램 비교 목록(Proc sql, SAS, SPSS, R 프로그래밍, R Tidyverse, Python Pandas, Python Dfply)
[Oracle, Pandas, R Prog, Dplyr, Sqldf, Pandasql, Data.Table] 오라클 함수와 R & Python 비교 사전 목록 링크
[SQL, Pandas, R Prog, Dplyr, SQLDF, PANDASQL, DATA.TABLE]
SQL EMP 예제로 만나는 테이블 데이터 처리 방법 리스트 링크
반응형

댓글