## 2019年 9月27日(金曜日)
# -*- mode: R -*-
##EmacsとESS使いの方に、モードラインの設定。


## ##参照ファイルは、
## forPrint*.pdf
## command.txt
##・read.table
## read.table("dat1.txt")
##・source
## source("pat.txt")
## source("patnew.dat")


##強引に、全ての入力データをクラスタリングしてみる。
source("pat.txt")#これは、dump()コマンドで作成した物、
BufInput[,1]


##1つめのデータ
BufInput[,1]
## [1] 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0
##2つめのデータ
BufInput[,2]
## [1] 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1

##差の2乗は、
(BufInput[,1]-BufInput[,2])^2
##その総和、
sum((BufInput[,1]-BufInput[,2])^2)
##総和の最小値は、0.0。これが、ニューラルネットワークでは「2乗誤差」
sum((BufInput[,1]-BufInput[,1])^2)

##つまり、まったく同じ塩基配列だったら、距離は0。2つの塩基配列中に、異
##なる塩基が有れば、距離は増える。このとき、塩基の(違いの)種類によらな
##い。異なる塩基数だけが距離の元。


dim(BufInput)
##全部だと大変なので、200個に数を絞る。
sort(sample(1:2155, 10, replace = F))
## [1]  111  476  949  953 1042 1111 1171 1278 1768 1777
##100個に絞り込む
sss1 <- sample(1:677,    100, replace = F)##EIから50個
sss2 <- sample(678:2155, 100, replace = F)##Nから50個
sss <- c(sss1,sss2)##バランスよく?集めた。2段階
hc <- hclust(dist(t(BufInput[,sss])))
plot(hc)
##なんとなく、固まっている奴らがあるぞと。これらは、何だろう。。。
##ラベルを見ると判るのであるが、、、


##Hierarchical Clusteringの戻り値
hc$order##は、hcで並べた「葉っぱ」の順番を返してくれる。
(hc$order)[1]##左端はこれ。
##で、BufInputは、行列の形。
dim(BufInput)##36行、2155列。で、列方向に、9塩基の配列を入れてある。
(dimnames(BufInput))[[2]]##これで、2155個の元の配列が判る。
##面倒なのは、、「(hc$order)」は、選択した200個の中の番号。
(sss)##は選択した200個。
(sss)[(hc$order)[1]]##これで、一番左端の番号が判る。
##やっと、左端の配列が判った。
((dimnames(BufInput))[[2]])[(sss)[(hc$order)[1]]]
##面倒なのは、[]()が入り乱れている。
##書いているときは、ふんふんと判るのだが、時間が経つとなんだっけになる。
##コメントしかないですね。。。

##ここまで、前振り。
##階層クラスタリングのラベルにEIかそうでないかを示す。
##hc$orderで、デンドログラム中の葉の順番が判る。
dimnames(BufTarget)[[2]]##で、おまけに付けたEIかNかが判る。
((dimnames(BufTarget))[[2]])[sss]##上記で作った、選択基準を適用。sssはTもしくはF
(((dimnames(BufTarget))[[2]])[sss])[hc$order]##クラスタリングで並べられた順。
(((dimnames(BufTarget))[[2]])[sss])[hc$order]=="EI"##ラベルが、EIか??
ifelse((((dimnames(BufTarget))[[2]])[sss])[hc$order]=="EI","*","")##そうだったら*を。
plot(hc,
     labels=ifelse((((dimnames(BufTarget))[[2]])[sss])[hc$order]=="EI","x","+"))


##古典的多次元尺度構成法で、2次元平面にプロットする。
library(MASS)
dat.cmd<-cmdscale(dist(t(BufInput[,sss])))
plot(dat.cmd, type="b", pch="")
text(dat.cmd[,1],dat.cmd[,2], rep(c("x","+"), each=100),
     col=rep(c("red","blue"), each=100))
##なんかとっても、うまくいくぞ。ホントかな、、、
 
##こっからSVM
##こっからSVM

par(mfcol=c(1,2))
##表面積一定の立方体の体積は
x1 <- seq(0,2,by=0.02)
x2 <- x1
## 制約式:x1*x2+x2*x3+x3*x1=3より
## x3*(x1+x2)=3-x1*x2だから
## 体積を求める関数は、x1とx2を用いて
g <- function(x1,x2){x1*x2*(3.0-x1*x2)/(x1+x2)}
taiseki <- outer(x1,x2,g)
image(x1,x2,taiseki,col=terrain.colors(30))
##par(new=T)
contour(x1,x2,taiseki,add=T)
persp(x1,x2,taiseki,theta = 30, phi = 30)

##最小2乗法も、偏微分=0で解いてる
##もうちょっとList形式
par(mfcol=c(1,1))
dat1 <- read.table("dat1.txt")
plot(dat1)
## lsfit() による最小二乗法
##横軸(dat1$V1)をx、縦軸(dat1$V2)をyと見たてて。
## lsfit(x, y, wt=NULL, intercept=TRUE, tolerance=1e-07, yname=NULL)
lsfit(dat1$V1,dat1$V2)##ズラズラでてくる。注意してみると
## $rank
## [1] 2
## みたいな行が出てくる。。
## ので、一回貯めて、
poi <- lsfit(dat1$V1,dat1$V2)
##(昔のpencaseの例みたいに)、名前(name)を出してもらう。
names(poi)
##[1] "coefficients" "residuals"    "intercept"    "qr" 
##んと、"coefficients"とは、
poi$coefficients
## Intercept         X 
## -3.775132 -1.279451 
abline(coef=poi$coeffi, col="green", lwd=3)
##ここで、「coeffi」で良いのは、そこまで言えば一意にきまる。から。



##実際に2次計画問題を解いてみる。
##setwd(choose.dir())
install.packages("e1071", repos="http://cran.ism.ac.jp/")
##install.packages("e1071_1.6-7.zip", repos=NULL)
library(e1071)

##データの読み込み
source("patnew.dat")
##x:配列とy:ラベルが読み込まれる。
xx <- x
yy <- y
seqS <- rownames(x)##配列を保存する。
rownames(x) <- c()##rowNameを空にする。
y <- ifelse(y==1,"EI","N")##ラベルの付け替えをする。
y <- as.factor(y)##ファクタに変換する。
## Factor w/ 2 levels "EI","N": 1 1 1 1 1 1 1 1 1 1 ...

##svmの実行
model <- svm(x, y, kernel="linear")
summary(model)

##サポートベクターは、
str(model$index)
## int [1:79] 53 58 73 86 135 144 145 146 185 189 ...
## 2155個のうちで、79個が選ばれている。
a <- rep(0,2155)
a[model$index] <- 1
par(mfcol=c(1,1))
plot(a)
abline(v=677)##これから前は、EIで、後がN。
text(mean(c(0,677)), 0.5, "EI", cex=3)
text(mean(c(677,2155)), 0.5, "N", cex=3)
table(y,a)
##EIのデータから多く選ばれているな。と。
##     a
## y       0    1
##   EI  636   41
##   N  1440   38

##予測する。
pred <- predict(model, x)
##サポートベクターに選ばれた入力データはkernel="linear"の場合、平面なの
##で、ニューラルネットワークの1つのセルと同じ
table(pred, y)

par(mfcol=c(1,1))
barplot(apply(-model$SV, 2, mean),
        col=rep(c("red","green","blue","yellow"),9))

##予測と本当のラベルの予測率。
(aaa <- table(pred, y))
##対角要素が正解。
sum(diag(aaa))/sum(aaa)

##kernel関数の変更
poi <- function(k){
   model <- svm(x, y, kernel=k, coef0=1);
   summary(model);
   pred <- predict(model, x);
   aaa <- table(pred, y);
   plot(cmdscale(dist(x)),
     col = as.integer(y),##Labelで色を変える。
     pch = c(".","*")[seq(along=y) %in% model$index + 1]);
   title(sprintf("%s correct %2.2g%%", k, sum(diag(aaa))/sum(aaa)*100))
}

poi("linear")
poi("polynomial") ##coef0=1, dgree=3(default)
poi("radial")

1 ;

##cmdscaleは、要素間の違いに基づき、2次元平面に分布させる方法
#distは、要素間の距離行列を計算する。ユークリッド距離。
#col = as.integer(y)で、EI(黒)、N(赤)で各点を書く。
#pchで、各点に現れる文字を指定する。
#model$indexは、サポートベクトルを示す
#%in%等を使って、サポートベクトルか否かの区別をしている。match(x, table)