基于R语言的主成分分析(PCA)实例详解
本质上来讲,主成分分析(PCA)主要是找到一个线性转换矩阵P,作用在矩阵X(X的列向量是一条记录,行向量是一个feature)上,使其转换(或称之为投影,投影可以使用矩阵形式表示)到一个新的空间中,得到矩阵Y。
R语言中内置两种PCA的实现分别为prcomp和princomp。前者采用SVD实现,后者采用上面的实对称矩阵对角化方式实现,两种的接口类似,只是前者的参数稍微多一些,具体如下:
>prcomp(x,...)
##S3method for class'formula'
prcomp(formula,data=NULL,subset,na.action,...)
##Default S3method:
prcomp(x,retx=TRUE,center=TRUE,scale.=FALSE,
tol=NULL,...)
##S3method for class'prcomp'
predict(object,newdata,...)
输出参数>names(pr)
[1]"sdev""rotation""center""scale""x""call"
>princomp(x,...)
##S3method for class'formula'
princomp(formula,data=NULL,subset,na.action,...)
##Default S3method:
princomp(x,cor=FALSE,scores=TRUE,covmat=NULL,
subset=rep_len(TRUE,nrow(as.matrix(x))),...)
##S3method for class'princomp'
predict(object,newdata,...)
输出参数>names(pr)
[1]"sdev""loadings""center""scale""n.obs""scores""call"
例子采用prcomp,旨在分析PCA分析的正过程和逆过程。
例子中数据都采用:
>conomy<-data.frame(
x1=c(149.3,161.2,171.5,175.5,180.8,190.7,
202.1,212.4,226.1,231.9,239.0),
x2=c(4.2,4.1,3.1,3.1,1.1,2.2,2.1,5.6,5.0,5.1,0.7),
x3=c(108.1,114.8,123.2,126.9,132.1,137.7,
146.0,154.1,162.3,164.3,167.6),
y=c(15.9,16.4,19.0,19.1,18.8,20.4,22.7,
26.5,28.1,27.6,26.3))
>conomy
x1x2x3y
1149.34.2108.115.9
2161.24.1114.816.4
3171.53.1123.219.0
4175.53.1126.919.1
5180.81.1132.118.8
6190.72.2137.720.4
7202.12.1146.022.7
8212.45.6154.126.5
9226.15.0162.328.1
10231.95.1164.327.6
11239.00.7167.626.3
实例1:PCA分析(不做标准化处理:不平移&不尺度化)
>pr<-prcomp(~x1+x2+x3,data=conomy,scale=F,center=F)#PCA分析(不做标准化处理:不平移&不尺度化)
>pr$x
PC1PC2PC3
1-184.3643-1.88162440.120442835
2-197.9363-0.9151848 1.293486904
3-211.1872-0.27573770.006869002
4-216.5936-0.5359147-0.619728451
5-223.90350.7587619-2.624453929
6-235.22570.5153778-1.098457784
7-249.32420.7229582-1.324352077
8-262.4610-2.4679990-0.232945692
9-278.3630-1.11693800.535960605
10-284.2423-0.2907220 2.085641330
11-291.8747 4.3590400 1.198693966
>cbind(conomy$x1,conomy$x2,conomy$x3)%*%pr$rotation#PCA正过程
PC1PC2PC3
[1,]-184.3643-1.88162440.120442835
[2,]-197.9363-0.9151848 1.293486904
[3,]-211.1872-0.27573770.006869002
[4,]-216.5936-0.5359147-0.619728451
[5,]-223.90350.7587619-2.624453929
[6,]-235.22570.5153778-1.098457784
[7,]-249.32420.7229582-1.324352077
[8,]-262.4610-2.4679990-0.232945692
[9,]-278.3630-1.11693800.535960605
[10,]-284.2423-0.2907220 2.085641330
[11,]-291.8747 4.3590400 1.198693966
>pr$x%*%solve(pr$rotation)#PCA逆过程
x1x2x3
1149.34.2108.1
2161.24.1114.8
3171.53.1123.2
4175.53.1126.9
5180.81.1132.1
6190.72.2137.7
7202.12.1146.0
8212.45.6154.1
9226.15.0162.3
10231.95.1164.3
11239.00.7167.6
实例2:PCA分析(部分标准化:只平移)
>conomy<-data.frame(
x1=c(149.3,161.2,171.5,175.5,180.8,190.7,
202.1,212.4,226.1,231.9,239.0),
x2=c(4.2,4.1,3.1,3.1,1.1,2.2,2.1,5.6,5.0,5.1,0.7),
x3=c(108.1,114.8,123.2,126.9,132.1,137.7,
146.0,154.1,162.3,164.3,167.6),
y=c(15.9,16.4,19.0,19.1,18.8,20.4,22.7,
26.5,28.1,27.6,26.3))
>pr<-prcomp(~x1+x2+x3,data=conomy,scale=F,center=T)#PCA分析(部分标准化:只平移)
>pr$x
PC1PC2PC3
1-55.2431860.8526477-0.6319214
2-41.6411980.4642629-1.7916913
3-28.396312-0.2823642-0.5011266
4-23.004179-0.11314370.2645521
5-15.693755-1.7829447 1.9673323
6-4.361502-0.94829860.7576149
79.734530-0.9774781 1.1588999
822.815447 2.6055542 1.1976004
938.749791 1.77566960.3621647
1044.662808 1.4979238-1.2531020
1152.377555-3.0918290-1.5303231
>cbind(conomy$x1-pr$center[1],conomy$x2-pr$center[2],conomy$x3-pr$center[3])%*%pr$rotation#PCA正过程PC1PC2PC3
[1,]-55.2431860.8526477-0.6319214
[2,]-41.6411980.4642629-1.7916913
[3,]-28.396312-0.2823642-0.5011266
[4,]-23.004179-0.11314370.2645521
[5,]-15.693755-1.7829447 1.9673323
[6,]-4.361502-0.94829860.7576149
[7,]9.734530-0.9774781 1.1588999
[8,]22.815447 2.6055542 1.1976004
[9,]38.749791 1.77566960.3621647
[10,]44.662808 1.4979238-1.2531020
[11,]52.377555-3.0918290-1.5303231
>temp<-pr$x%*%solve(pr$rotation)#PCA逆过程>cbind(temp[,1]+pr$center[1],temp[,2]+pr$center[2],temp[,3]+pr$center[3])
[,1][,2][,3]
1149.3 4.2108.1
2161.2 4.1114.8
3171.5 3.1123.2
4175.5 3.1126.9
5180.8 1.1132.1
6190.7 2.2137.7
7202.1 2.1146.0
8212.4 5.6154.1
9226.1 5.0162.3
10231.9 5.1164.3
11239.00.7167.6
实例3:PCA分析(标准化处理:平移&尺度化)
>conomy<-data.frame(
x1=c(149.3,161.2,171.5,175.5,180.8,190.7,
202.1,212.4,226.1,231.9,239.0),
x2=c(4.2,4.1,3.1,3.1,1.1,2.2,2.1,5.6,5.0,5.1,0.7),
x3=c(108.1,114.8,123.2,126.9,132.1,137.7,
146.0,154.1,162.3,164.3,167.6),
y=c(15.9,16.4,19.0,19.1,18.8,20.4,22.7,
26.5,28.1,27.6,26.3))
>pr<-prcomp(~x1+x2+x3,data=conomy,scale=T,center=T)#PCA分析(标准化处理:平移&尺度化)
>pr$x
PC1PC2PC3
1-2.12588720.63865815-0.02072230
2-1.61892730.55553922-0.07111317
3-1.1151675-0.07297970-0.02173008
4-0.8942966-0.082369980.01081318
5-0.6442081-1.306685230.07258248
6-0.1903514-0.659147450.02655252
70.3596219-0.743674470.04278124
80.9718018 1.354058770.06286252
9 1.55931590.964045580.02357446
10 1.7669951 1.01521706-0.04498818
11 1.9311034-1.66266195-0.08061267
>cbind((conomy$x1-pr$center[1])/pr$scale[1],(conomy$x2-pr$center[2])/pr$scale[2], (conomy$x3-pr$center[3])/pr$scale[3])%*%pr$rotation#PCA正过程PC1PC2PC3
[1,]-2.12588720.63865815-0.02072230
[2,]-1.61892730.55553922-0.07111317
[3,]-1.1151675-0.07297970-0.02173008
[4,]-0.8942966-0.082369980.01081318
[5,]-0.6442081-1.306685230.07258248
[6,]-0.1903514-0.659147450.02655252
[7,]0.3596219-0.743674470.04278124
[8,]0.9718018 1.354058770.06286252
[9,] 1.55931590.964045580.02357446
[10,] 1.7669951 1.01521706-0.04498818
[11,] 1.9311034-1.66266195-0.08061267
>temp<-pr$x%*%solve(pr$rotation)#PCA逆过程
>cbind(temp[,1]*pr$scale[1]+pr$center[1],temp[,2]*pr$scale[2]+pr$center[2],temp[,3]*pr$scale[3]+pr$center[3]) [,1][,2][,3]
1149.3 4.2108.1
2161.2 4.1114.8
3171.5 3.1123.2
4175.5 3.1126.9
5180.8 1.1132.1
6190.7 2.2137.7
7202.1 2.1146.0
8212.4 5.6154.1
9226.1 5.0162.3
10231.9 5.1164.3
11239.00.7167.6