포스팅 목차
* 파이썬 & R 패키지 호출 및 예제 데이터 생성 링크
[ AVG 함수 ]
AVG함수는 지정된 컬럼에 대한 조건을 만족하는 행중에서 Null을 제외한 평균을 반환한다. 숫자 데이터 타입에만 사용한다.
- 함수설명 : AVG 오라클 함수 링크
1. Oracle(오라클)
1. 집계함수
Oracle Programming |
SELECT AVG(SAL) "Average"
FROM EMP;
Results |
Average
---------------------------------------
2073.214285714285714285714285714285714286
2. 분석함수
Oracle Programming |
SELECT HIREDATE, SAL,
AVG(SAL) OVER (PARTITION BY DEPTNO) AS SAL_AVG
FROM emp
order
by deptno
Results |
HIREDATE SAL SAL_AVG
-----------------------------------------------------------------
09-JUN-81 2450 2916.666666666666666666666666666666666667
23-JAN-82 1300 2916.666666666666666666666666666666666667
17-NOV-81 5000 2916.666666666666666666666666666666666667
03-DEC-81 3000 2175
19-APR-87 3000 2175
02-APR-81 2975 2175
17-DEC-80 800 2175
23-MAY-87 1100 2175
22-FEB-81 1250 1566.666666666666666666666666666666666667
28-SEP-81 1250 1566.666666666666666666666666666666666667
08-SEP-81 1500 1566.666666666666666666666666666666666667
03-DEC-81 950 1566.666666666666666666666666666666666667
20-FEB-81 1600 1566.666666666666666666666666666666666667
01-MAY-81 2850 1566.666666666666666666666666666666666667
2. Python Pandas(파이썬)
1. 집계함수
- 전체 직원의 평균 급여를 계산
Python Programming |
emp['sal'].mean()
Results |
2073.214285714286
- deptno 부서별 평균 급여 계산
Python Programming |
emp.groupby('deptno')['sal'].mean()
Results |
deptno
10 2916.666667
20 2175.000000
30 1566.666667
Name: sal, dtype: float64
Python Programming |
emp.groupby('deptno')['sal'].apply(lambda x: x.mean())
Results |
deptno
10 2916.666667
20 2175.000000
30 1566.666667
Name: sal, dtype: float64
Python Programming |
emp.groupby('deptno')['sal'].agg(['mean']).reset_index()
Results |
deptno mean
0 10 2916.666667
1 20 2175.000000
2 30 1566.666667
2. 분석함수
Python Programming |
withmooc = emp.copy()
withmooc['sal_mean'] = emp.groupby('deptno')['sal'].transform('mean')
withmooc.sort_values('deptno').head()
Results |
Python Programming |
withmooc = emp.copy()
withmooc['sal_mean'] = emp.groupby('deptno')['sal'].transform(lambda x:x.mean())
withmooc.sort_values('deptno').head()
Results |
Python Programming |
withmooc = emp.copy()
group_mean = withmooc.groupby('deptno')['sal'].mean()
withmooc['sal_mean'] = withmooc['deptno'].map(group_mean)
withmooc.sort_values('deptno').head()
Results |
3. R Programming (R Package)
1. 집계함수
- 전체 직원의 평균 급여를 계산
R Programming |
%%R
mean(emp$sal)
Results |
[1] 2073.214
- deptno 부서별 평균 급여 계산
R Programming |
%%R
aggregate(sal ~ deptno, data=emp , FUN=mean)
Results |
deptno sal
1 10 2916.667
2 20 2175.000
3 30 1566.667
R Programming |
%%R
do.call(rbind, Map(mean, split(emp$sal, emp$deptno)))
Results |
[,1]
10 2916.667
20 2175.000
30 1566.667
R Programming |
%%R
do.call(rbind, lapply(split(emp$sal, emp$deptno), mean))
Results |
[,1]
10 2916.667
20 2175.000
30 1566.667
R Programming |
%%R
do.call(rbind, purrr::map_dfr(split(emp,emp$deptno), ~mean(.x$sal)) )
Results |
[,1]
10 2916.667
20 2175.000
30 1566.667
R Programming |
%%R
unlist(with(emp, by(sal, deptno, mean)))
Results |
deptno: 10
[1] 2916.667
------------------------------------------------------------
deptno: 20
[1] 2175
------------------------------------------------------------
deptno: 30
[1] 1566.667
R Programming |
%%R
# purrr::map_chr(split(emp,emp$deptno), ~median(.x$sal))
purrr::map_dfr(split(emp,emp$deptno), ~mean(.x$sal))
Results |
# A tibble: 1 x 3
10 20 30
1 2917. 2175 1567.
2. 분석함수
R Programming |
%%R
withmooc <- emp
withmooc['sal_mean'] = with(withmooc, ave(sal, deptno, FUN=mean)) # FUN =function(x) { mean(x) }
withmooc[order(withmooc$deptno),][1:5,]
Results |
# A tibble: 5 x 9
empno ename job mgr hiredate sal comm deptno sal_mean
1 7782 CLARK MANAGER 7839 1981-01-09 2450 NA 10 2917.
2 7839 KING PRESIDENT NA 1981-11-17 5000 NA 10 2917.
3 7934 MILLER CLERK 7782 1982-01-23 1300 NA 10 2917.
4 7369 SMITH CLERK 7902 1980-12-17 800 NA 20 2175
5 7566 JONES MANAGER 7839 1981-04-02 2975 NA 20 2175
R Programming |
%%R
withmooc <- emp
withmooc['sal_median'] = ave(x = withmooc$sal, withmooc$deptno, FUN = mean) # FUN = function (x) { min(x) }
withmooc[order(withmooc$deptno),][1:5, ]
Results |
# A tibble: 5 x 9
empno ename job mgr hiredate sal comm deptno sal_median
1 7782 CLARK MANAGER 7839 1981-01-09 2450 NA 10 2917.
2 7839 KING PRESIDENT NA 1981-11-17 5000 NA 10 2917.
3 7934 MILLER CLERK 7782 1982-01-23 1300 NA 10 2917.
4 7369 SMITH CLERK 7902 1980-12-17 800 NA 20 2175
5 7566 JONES MANAGER 7839 1981-04-02 2975 NA 20 2175
R Programming |
%%R
withmooc <- emp
mean_var <- with(withmooc, purrr::map_chr(base::split(withmooc,withmooc$deptno), ~mean(.x$sal)) )
# print(mean_var)
data.frame(withmooc, mean_var =rep(mean_var, table(withmooc$deptno)))[1:5, ]
Results |
10 20 30
"2916.666667" "2175.000000" "1566.666667"
empno ename job mgr hiredate sal comm deptno mean_var
1 7369 SMITH CLERK 7902 1980-12-17 800 NA 20 2916.666667
2 7499 ALLEN SALESMAN 7698 1981-02-20 1600 300 30 2916.666667
3 7521 WARD SALESMAN 7698 1981-02-22 1250 500 30 2916.666667
4 7566 JONES MANAGER 7839 1981-04-02 2975 NA 20 2175.000000
5 7654 MARTIN SALESMAN 7698 1981-09-28 1250 1400 30 2175.000000
R Programming |
%%R
withmooc <- emp
mean_var <- with(withmooc, tapply(sal, deptno, mean))
data.frame(emp, mean_var =rep(mean_var, table(withmooc$deptno)))[1:5, ]
# with(withmooc, data.frame(emp, mean_var=rep(tapply(sal, deptno, mean), table(deptno))) )
Results |
empno ename job mgr hiredate sal comm deptno mean_var
1 7369 SMITH CLERK 7902 1980-12-17 800 NA 20 2916.667
2 7499 ALLEN SALESMAN 7698 1981-02-20 1600 300 30 2916.667
3 7521 WARD SALESMAN 7698 1981-02-22 1250 500 30 2916.667
4 7566 JONES MANAGER 7839 1981-04-02 2975 NA 20 2175.000
5 7654 MARTIN SALESMAN 7698 1981-09-28 1250 1400 30 2175.000
R Programming |
%%R
library(plyr) # dplyr 보다 먼저 로딩
library(dplyr)
plyr::ddply(emp,.(deptno),transform,sal_mean = mean(sal))[1:5, ]
Results |
empno ename job mgr hiredate sal comm deptno sal_mean
1 7782 CLARK MANAGER 7839 1981-01-09 2450 NA 10 2916.667
2 7839 KING PRESIDENT NA 1981-11-17 5000 NA 10 2916.667
3 7934 MILLER CLERK 7782 1982-01-23 1300 NA 10 2916.667
4 7369 SMITH CLERK 7902 1980-12-17 800 NA 20 2175.000
5 7566 JONES MANAGER 7839 1981-04-02 2975 NA 20 2175.000
R Programming |
%%R
withmooc <- emp
mapping <- t( purrr::map_dfc( split(withmooc,withmooc$deptno), ~mean(.x$sal) ) )
trans_data <- melt(mapping, value.name="sal_mean")
merge(emp,trans_data,by.x='deptno',by.y='Var1')[1:5, ]
Results |
deptno empno ename job mgr hiredate sal comm Var2 sal_mean
1 10 7782 CLARK MANAGER 7839 1981-01-09 2450 NA 1 2916.667
2 10 7839 KING PRESIDENT NA 1981-11-17 5000 NA 1 2916.667
3 10 7934 MILLER CLERK 7782 1982-01-23 1300 NA 1 2916.667
4 20 7369 SMITH CLERK 7902 1980-12-17 800 NA 1 2175.000
5 20 7876 ADAMS CLERK 7788 1983-01-12 1100 NA 1 2175.000
4. R Dplyr Package
1. 집계함수
- 전체 직원의 평균 급여를 계산
R Programming |
%%R
emp %>%
summarize( sal_mean = mean(sal))
- deptno 부서별 평균 급여 계산
R Programming |
%%R
emp %>%
dplyr::group_by(deptno) %>%
dplyr::summarize(sal_mean = mean(sal))
Results |
summarise() ungrouping output (override with .groups argument)
# A tibble: 3 x 2
deptno sal_mean
1 10 2917.
2 20 2175
3 30 1567.
R Programming |
%%R
emp %>%
tbl_df() %>%
nest(-deptno) %>%
mutate(sal_mean = map(data, ~ mean(.$sal))) %>%
unnest(sal_mean)
Results |
# A tibble: 3 x 3
deptno data sal_mean
1 20 2175
2 30 1567.
3 10 2917.
2. 분석함수
R Programming |
%%R
emp %>%
dplyr::group_by(deptno) %>%
dplyr::mutate(sal_mean = mean(sal)) %>%
dplyr::arrange(deptno) %>%
head()
Results |
# A tibble: 6 x 9
# Groups: deptno [2]
empno ename job mgr hiredate sal comm deptno sal_mean
1 7782 CLARK MANAGER 7839 1981-01-09 2450 NA 10 2917.
2 7839 KING PRESIDENT NA 1981-11-17 5000 NA 10 2917.
3 7934 MILLER CLERK 7782 1982-01-23 1300 NA 10 2917.
4 7369 SMITH CLERK 7902 1980-12-17 800 NA 20 2175
5 7566 JONES MANAGER 7839 1981-04-02 2975 NA 20 2175
6 7788 SCOTT ANALYST 7566 1982-12-09 3000 NA 20 2175
- purrr 패키지
R Programming |
%%R
withmooc <- emp %>% mutate(deptno = as.character(deptno))
withmooc %>%
dplyr::left_join( ( purrr::map_dfr(split(withmooc,withmooc$deptno), ~mean(.x$sal)) %>%
gather(key = deptno, value = median_sal )
),
id = "deptno"
) %>%
dplyr::arrange(deptno) %>%
head()
Results |
Joining, by = "deptno"
# A tibble: 6 x 9
empno ename job mgr hiredate sal comm deptno median_sal
1 7782 CLARK MANAGER 7839 1981-01-09 2450 NA 10 2917.
2 7839 KING PRESIDENT NA 1981-11-17 5000 NA 10 2917.
3 7934 MILLER CLERK 7782 1982-01-23 1300 NA 10 2917.
4 7369 SMITH CLERK 7902 1980-12-17 800 NA 20 2175
5 7566 JONES MANAGER 7839 1981-04-02 2975 NA 20 2175
6 7788 SCOTT ANALYST 7566 1982-12-09 3000 NA 20 2175
5. R sqldf Package
- Contributed Files : https://www.sqlite.org/contrib?orderby=name
1. 집계함수
R Programming |
%%R
sqldf(" SELECT AVG(sal) Average FROM emp; ")
Results |
Average
1 2073.214
R Programming |
%%R
sqldf(" SELECT DEPTNO, AVG(sal) Average FROM emp GROUP BY DEPTNO; ")
Results |
deptno Average
1 10 2916.667
2 20 2175.000
3 30 1566.667
2. 분석함수
R Programming |
%%R
sqldf(" SELECT HIREDATE, SAL, AVG(SAL) OVER (PARTITION BY DEPTNO) AS SAL_AVG
FROM emp
order by deptno")[1:5, ]
Results |
hiredate sal SAL_AVG
1 1981-01-09 2450 2916.667
2 1981-11-17 5000 2916.667
3 1982-01-23 1300 2916.667
4 1980-12-17 800 2175.000
5 1981-04-02 2975 2175.000
6. Python pandasql Package
1. 집계함수
Python Programming |
ps.sqldf(" SELECT AVG(sal) Average FROM emp ")
Results |
Average
0 2073.214286
Python Programming |
ps.sqldf(" SELECT DEPTNO, AVG(sal) Average FROM emp GROUP BY DEPTNO ")
Results |
deptno Average
0 10 2916.666667
1 20 2175.000000
2 30 1566.666667
2. 분석함수
Python Programming |
ps.sqldf(" SELECT HIREDATE, SAL, AVG(SAL) OVER (PARTITION BY DEPTNO) AS SAL_AVG \
FROM emp \
order by deptno ").head()
Results |
hiredate sal SAL_AVG
0 1981/01/09 2450 2916.666667
1 1981/11/17 5000 2916.666667
2 1982/01/23 1300 2916.666667
3 1980/12/17 800 2175.000000
4 1981/04/02 2975 2175.000000
7. R data.table Package
1. 집계함수
R Programming |
%%R
DT <- data.table(emp)
dept_DT <- data.table(dept)
DT[, .(`sal_mean` = mean(sal, na.rm = TRUE))]
Results |
sal_mean
1: 2073.214
R Programming |
%%R
DT <- data.table(emp)
dept_DT <- data.table(dept)
DT[, list(sal_mean = mean(sal) ), by =deptno]
Results |
deptno sal_mean
1: 20 2175.000
2: 30 1566.667
3: 10 2916.667
2. 분석함수
R Programming |
%%R
DT <- data.table(emp)
dept_DT <- data.table(dept)
DT[, sal_mean := mean(sal), by = deptno][order(deptno),][1:5, ]
Results |
empno ename job mgr hiredate sal comm deptno sal_mean
1: 7782 CLARK MANAGER 7839 1981-01-09 2450 NA 10 2916.667
2: 7839 KING PRESIDENT NA 1981-11-17 5000 NA 10 2916.667
3: 7934 MILLER CLERK 7782 1982-01-23 1300 NA 10 2916.667
4: 7369 SMITH CLERK 7902 1980-12-17 800 NA 20 2175.000
5: 7566 JONES MANAGER 7839 1981-04-02 2975 NA 20 2175.000
R Programming |
%%R
DT <- data.table(emp)
dept_DT <- data.table(dept)
DT[, `:=` ( sal_mean = mean(unlist(.SD)) ),
by = deptno,
.SDcols = c("sal")][order(deptno),][1:5, ]
Results |
empno ename job mgr hiredate sal comm deptno sal_mean
1: 7782 CLARK MANAGER 7839 1981-01-09 2450 NA 10 2916.667
2: 7839 KING PRESIDENT NA 1981-11-17 5000 NA 10 2916.667
3: 7934 MILLER CLERK 7782 1982-01-23 1300 NA 10 2916.667
4: 7369 SMITH CLERK 7902 1980-12-17 800 NA 20 2175.000
5: 7566 JONES MANAGER 7839 1981-04-02 2975 NA 20 2175.000
8. Python DuckDB의 SQL
1. 집계함수
Python Programming |
%%sql
SELECT AVG(sal) "Average"
FROM emp;
Python Programming |
print( duckdb.sql(" SELECT AVG(sal) Average FROM emp ").df() )
Results |
Average
0 2073.214286
Python Programming |
%%sql
SELECT deptno,
AVG(SAL) AS SAL_AVG
FROM emp
group by 1
order by 1
Python Programming |
SELECT deptno, \
AVG(SAL) AS SAL_AVG \
FROM emp \
group by 1 \
order by 1 ").df()
Results |
deptno SAL_AVG
0 10 2916.666667
1 20 2175.000000
2 30 1566.666667
2. 분석함수
Python Programming |
%%sql
SELECT empno,HIREDATE, SAL,
AVG(SAL) OVER (PARTITION BY DEPTNO) AS SAL_AVG
FROM emp
order
by deptno
Python Programming |
duckdb.sql(" SELECT empno,HIREDATE, SAL, \
AVG(SAL) OVER (PARTITION BY DEPTNO) AS SAL_AVG \
FROM emp \
order \
by deptno ").df()
Results |
empno hiredate sal SAL_AVG
0 7782 1981/01/09 2450 2916.666667
1 7839 1981/11/17 5000 2916.666667
2 7934 1982/01/23 1300 2916.666667
3 7369 1980/12/17 800 2175.000000
4 7566 1981/04/02 2975 2175.000000
5 7788 1982/12/09 3000 2175.000000
6 7876 1983/01/12 1100 2175.000000
7 7902 1981/12/03 3000 2175.000000
8 7499 1981/02/20 1600 1566.666667
9 7521 1981/02/22 1250 1566.666667
10 7654 1981/09/28 1250 1566.666667
11 7698 1981/03/01 2850 1566.666667
12 7844 1981/09/08 1500 1566.666667
13 7900 1981/12/03 950 1566.666667
--------------------------------------------
[Oracle, Pandas, R Prog, Dplyr, Sqldf, Pandasql, Data.Table] 오라클 함수와 R & Python 비교 사전 목록 링크 |
오라클 SQL 함수(Oracle SQL Function) 목록 리스트 링크 |
[SQL, Pandas, R Prog, Dplyr, SQLDF, PANDASQL, DATA.TABLE] SQL EMP 예제로 만나는 테이블 데이터 처리 방법 리스트 링크 링크 |
반응형
댓글