## -*- mode: R -*-
## 2018年10月19日(金)
##EmacsとESS使いの方に、モードラインの設定。

## ##参照ファイルは、
## forPrint*.pdf
## command.txt
## ・source
## source("snn.txt")
## ・read.table
## splice.All <- read.table("splice.dat",header=TRUE)
## dat1 <- read.table("dat1.txt")
## dat2 <- read.table("dat2.txt")
## ・install
## install.packages("tree_1.0-37.zip", repo=NULL)
############

##plot(sin)
##setwd(choose.dir())

maru <- matrix(c(13, 72,
                 11, 35,
                 37, 57,
                 23, 41,
                 26, 21,
                 40, 68,
                 43, 50,
                 68, 75,
                 89, 77),
               ncol=2 ,byrow=T)
batsu <- matrix(c(68, 50,
                  77, 43,
                  88, 46,
                  62, 27,
                  78, 29,
                  62, 31,
                  55, 13,
                  89, 16),
                ncol=2 ,byrow=T)


##svmのマージン最大化の説明
plot(rbind(maru,batsu),type="n")
##pch=21は色ありの●。
points(maru[,],pch=21,col=1,cex=2,lw=3,bg="orange")
points(batsu[,],pch=4,col=2,cex=2,lw=3)

col2rgb("orange")/255
plot(rbind(maru,batsu),type="n")
##21は色ありの●。
for( i in 1:1000 ){
  points(maru[,]+5*rnorm(length(as.numeric(maru))),
         col=NULL, bg=rgb(1, 0.64, 0, alpha=0.1), pch=21, cex=2)
}
points(maru[,],pch=21,col=1,cex=2,bg="orange",lw=3)

for( i in 1:1000 ){
  points(batsu[,]+5*rnorm(length(as.numeric(batsu))),
         pch=4,col=1,cex=2)
}
points(batsu[,],pch=4,col=2,cex=2,lw=2)


##お次の話題、の前にと。
##maru,batsuを書きたい。
##まとめて書くと、maruとbatsuの区別がつかない。
plot(rbind(maru,batsu))

##par(new=T)のテクを使うと、maruとbatsuの区別がつくが軸が変になる。
plot(maru, pch=1)
par(new=T)
plot(batsu, pch=2)

##軸の値を同じにするために、
plot(maru, pch=1)
str(maru)
text(maru, as.character(1:9),col=1, cex=3)
points(batsu, pch=2)
text(batsu, as.character(1:8),col=2, cex=3)
str(batsu)##8点有ったのに。。。

##まとめて書く方法。
##①枠だけ描いて、pointsで点を打つ。
plot(rbind(maru,batsu), pch="")
points(maru, pch=1, type="b")
str(maru)
text(maru, as.character(1:9),col=1, cex=3)
points(batsu, pch=2, type="b")
text(batsu, as.character(1:8),col=2, cex=3)

##②pchを判って書き分ける。
plot(rbind(maru,batsu), pch=rep(c(1,2), times=c(9,8)))
text(maru, as.character(1:9),col=1, cex=3)
text(batsu, as.character(1:8),col=2, cex=3)



##setwd(choose.dir());getwd()
##NNを使って認識面を出してみよう
source("snn.txt")
##学習データを手で作る。
BufInput <- t(rbind(maru, batsu))
dim(BufInput)## 2x17の行列。
BufTarget <- t(rbind(matrix(rep(c(1,0),9), ncol=2, byrow=T),
                     matrix(rep(c(0,1),8), ncol=2, byrow=T)))
dim(BufTarget)## 2x17の行列。
BufTarget

date()
e<-c()
for(i in 1:100){
  for(iSet in sample(17)){
    learn(iSet);
    e<-c(e,Error);
  }
}
date()
plot(e)
plot(sapply(1:17, activate))

plot(t(BufInput))## 2x17の行列。

apply(t(BufInput), 2, range)
##x1の範囲は、11~89．x2の範囲は、13~77．


## activate <- function(iset){
## #foward propagation;
##     Input$out <<- BufInput[,iset];
##     ...
##   }
##なので、いかさまに、BufInput[,iset]に値を突っ込む。


##全ての領域でどのような結果になるか??
tmpS <- c()
for( x1 in 11:89 ){
  tmp <- c()
  for( x2 in 13:77 ){
    BufInput[,1] <- c(x1,x2)
    activate(1)
    tmp <- c(tmp, (Output$out)[1])
  }
  tmpS <- rbind(tmpS, tmp)
}
str(tmpS)
image(11:89, 13:77,  (tmpS), col=terrain.colors(10))
contour(11:89, 13:77,  (tmpS), add=T)##重ねがき。
points(rbind(maru,batsu), pch=rep(c(1,2), times=c(9,8)))
text(maru, as.character(1:9),col=1, cex=3)
text(batsu, as.character(1:8),col=2, cex=3)


##X1軸だけの値を頼りに分割
hist(rbind(maru,batsu)[,1])
##plot=Fにしておくと、ヒストグラムを書かない。Binだけ求める。
brks <- hist(rbind(maru,batsu)[,1],plot=F)$breaks
brks
range(rbind(maru,batsu)[,1])

a <- hist(rbind(maru,batsu)[,1], breaks=brks,plot=F)
##type="n"にしておくと、枠だけ。データの線を書かない
plot(a$mids, a$density, ylim=c(0, 0.04),type="n")
##a$midsはBinの中心。a$densityは頻度の密度表示
a <- hist(maru[,1], breaks=brks, plot=F)
lines(a$mids, a$density, type="b",col="orange",lw=3)
a <- hist(batsu[,1], breaks=brks, plot=F)
lines(a$mids, a$density, type="b", col=1, lw=3)

     
##Rで眺める
##○と×を一つにまとめた。
mb <- rbind(cbind(maru,1,0),cbind(batsu,0,1))

##mb
##       [,1] [,2] [,3] [,4]
##  [1,]   13   72    1    0
##  [2,]   11   35    1    0
##  [3,]   37   57    1    0
## ...
## [15,]   62   31    0    1
## [16,]   55   13    0    1
## [17,]   89   16    0    1

##X1の要素を、小さい順に並べ替えると
sort(mb[,1])
##そう、全部で高々17個(全データの個数)。
##それぞれの間が閾値になり得るので、閾値候補としては、16個
head(sort(mb[,1]),16)+diff(sort(mb[,1]))/2

##sortしたときに何番目に来るか??
order(mb[,1])
##その順に並べてみると、「x1の値が、小さい順」
mb[order(mb[,1]),]
##端から順に足してみる。
cumsum((mb[,3])[order(mb[,1])])##○
cumsum((mb[,4])[order(mb[,1])])##×
##cf::cumsum(1:10)
##cumulate 積み重ねる,積み上げる,集積する;累加する.

##maruの個数が閾値に従って、どの様に変化するか??
plot(sort(mb[,1]),cumsum((mb[,3])[order(mb[,1])]))
##batsuの個数が閾値に従って、どの様に変化するか??
plot(sort(mb[,1]),cumsum((mb[,4])[order(mb[,1])]))
##両方まとめて書いてみる
matplot(c(head(sort(mb[,1]),16)+diff(sort(mb[,1]))/2,max(mb[,1])+1),
        cbind(cumsum((mb[,3])[order(mb[,1])]),
              cumsum((mb[,4])[order(mb[,1])])),
        pch=c("o","x"),
        type="b"
        )
##X2でやってみる。
matplot(c(head(sort(mb[,2]),16)+diff(sort(mb[,2]))/2,max(mb[,2])+1),
        cbind(cumsum((mb[,3])[order(mb[,2])]),
              cumsum((mb[,4])[order(mb[,2])])),
        pch=c("o","x"),
        type="b"
        )

##???はて、どこで区切れば良いのだろうか??X1軸、X2軸??
##天下り的に情報量
ent <- function(x){ if(x==0.0){ 0.0 } else {  -x*log(x) }}
plot(seq(0,1,0.1), 
     sapply(seq(0,1,0.1), ent)+
     sapply(1.0-seq(0,1,0.1), ent), type="b")

##この関数は、(閾値よりも左側の○の数、閾値よりも左側の×の数)が入力
poi <- function(x){
  m <-x[1]##閾値よりも左側の○の数
  b <-x[2]##閾値よりも左側の×の数
  if(m+b==0){
    ans <- 0;
  } else {
    ##左側での(○率、×率)
    pm <- m/(m+b)
    pb <- b/(m+b)
    ##左側には、(m+b)個有るから、
    ans <- (m+b)*(ent(pm)+ent(pb))
  }
  ##右側も同様にして考える。
  m <- 9-x[1]##閾値よりも右側の○の数
  b <- 8-x[2]##閾値よりも右側の×の数
  if(m+b==0){
    ans;
  } else {
    pm <- m/(m+b)
    pb <- b/(m+b)
    ans <- ans + (m+b)*(ent(pm)+ent(pb))
  }
  return(ans)
}
poi(c(1,8))
poi(c(4,4))

mb
##     [,1] [,2] [,3] [,4]
## X1属性値、X2属性値、○だったら1、×だったら1
## [1,]   13   72    1    0
## [2,]   11   35    1    0
## ...
## [16,]   55   13    0    1
## [17,]   89   16    0    1

mb[,3]##○であれば、1．
## 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
cumsum(mb[,3])##「自分の値以下」の○の数。
## 1 2 3 4 5 6 7 8 9 9 9 9 9 9 9 9 9

mb[,1]##X1属性の値。
order(mb[,1])##「X1属性値」をsortすると、何番目になるか?
(mb[,1])[order(mb[,1])]##「X1属性値」をsortした結果と同じ。
sort(mb[,1])##「X1属性値」をsortすると、何番目になるか?

##「自分の値以下」の○の数。×も含むからこうなる。
cumsum((mb[,3])[order(mb[,1])])

z <- cbind(cumsum((mb[,3])[order(mb[,1])]),
           cumsum((mb[,4])[order(mb[,1])]))
z
##       [,1] [,2]
##「X1属性値」で、
##「自分の値以下」の○の数、×の数
##  [1,]    1    0
##  [2,]    2    0
## ...
## [16,]    9    7
## [17,]    9    8

##これは、このまま素直にさっきの関数に適用できる。
plot(sort(mb[,1]),apply(z, 1, poi),type="b")

z <- cbind(cumsum((mb[,3])[order(mb[,2])]),
           cumsum((mb[,4])[order(mb[,2])]))

plot(sort(mb[,2]),apply(z, 1, poi),type="b")
     
poi  
############################
######  Treeパッケージの準備
############################
## [パッケージ]>>[ローカルにあるZIPファイルから]
## ***.zipを選択
## [パッケージ]>>[パッケージの読み込み]
## treeを選択
## 
## ##コマンドラインでで実行するには、utils:::menuInstallLocal()
## install.packages("tree_1.0-25.zip", repo=NULL)
##setwd(choose.dir())
install.packages("tree_1.0-37.zip", repo=NULL)
library(tree)
##setwd(choose.dir())

############################
######  決定木の説明  ######
############################
##splicingのデータ。
splice.All <- read.table("splice.dat",header=TRUE)
dim(splice.All)##3181個のデータ、塩基の数は、19。先頭はラベル({EI,IE,N})
summary(splice.All)
       
##がつがついかない。ラベルが{EI,N}を選んで使おう。
splice <- splice.All[splice.All$label=="EI"|splice.All$label=="N",]
dim(splice)##2415個のデータ、20塩基の数は、19。先頭はラベル({EI,IE,N})
summary(splice)
##大体の様子が判る。
##  label     s1      s2      s3      s4      s5      s6   ...
##  EI: 764   A:619   A:571   A:614   A:611   A:581   A:675...
##  IE:   0   C:641   C:606   C:588   C:582   C:645   C:702...
##  N :1651   G:614   G:649   G:659   G:603   G:661   G:572...
##            T:541   T:589   T:554   T:619   T:528   T:466...

splice.tree <- tree(label ~ ., data=splice)
##「spliceデータで、labelを予測してね」。「.」は、 label以外の属性値を用いて予測する
##「当て物関数」での、お約束。

plot(splice.tree)
text(splice.tree)

str(splice.tree)
##node), split, n, deviance, yval, (yprob) * denotes terminal node
##1) root 2415 3014.00 N ( 0.316356 0.000000 0.683644 )  
##  2) s10: A,C,G 1238   86.42 N ( 0.005654 0.000000 0.994346 ) *
##  3) s10: T 1177 1534.00 EI ( 0.643161 0.000000 0.356839 )  
##    6) s9: A,C,T 322    0.00 N ( 0.000000 0.000000 1.000000 ) *
##    7) s9: G 855  608.90 EI ( 0.885380 0.000000 0.114620 )  
##     14) s13: A,C,T 190  260.40 EI ( 0.563158 0.000000 0.436842 )  
##       28) s8: A,C,T 63   24.12 N ( 0.047619 0.000000 0.952381 ) *
##       29) s8: G 127  120.20 EI ( 0.818898 0.000000 0.181102 ) *
##     15) s13: G 665  143.40 EI ( 0.977444 0.000000 0.022556 )  
##       30) s12: A 488   14.38 EI ( 0.997951 0.000000 0.002049 ) *
##       31) s12: C,G,T 177   97.90 EI ( 0.920904 0.000000 0.079096 )  
##         62) s8: A,C,T 23   31.84 N ( 0.478261 0.000000 0.521739 ) *
##         63) s8: G 154   21.35 EI ( 0.987013 0.000000 0.012987 ) *

##予測と実際の違い。
splice.prd <- predict(splice.tree, newdata=splice, type="class")

##予測と本当のラベル
table(splice.prd,splice$label)

#############################
##全部のラベルでやってみよう。
#############################
splice.All <- read.table("splice.dat",header=TRUE)
splice <- splice.All##上書きして、全ラベル({EI,IE,N})使う。
splice.tree <- tree(label ~ ., data=splice)
plot(splice.tree)
text(splice.tree)
splice.tree

##予測
splice.prd <- predict(splice.tree, newdata=splice, type="class")

##予測と本当のラベル
table(splice.prd,splice$label)

##これでは、細かくならない。
splice.tree <- tree(label ~ ., data=splice, mincut=1, minsize=2)
##これでも、細かくならない。
splice.tree <- tree(label ~ ., data=splice, mindev=0.00000000001)
##これで、細かく出来る。
splice.tree <- tree(label ~ ., data=splice, mincut=1, minsize=2, mindev=0)

##これで、細かく出来る。
splice.tree <- tree(label ~ ., data=splice, minsize=1, mindev=0)
splice.prd <- predict(splice.tree, newdata=splice, type="class")

plot(splice.tree,type="uni")
text(splice.tree)

##予測と本当のラベル
table(splice.prd,splice$label)


###些細なルールを作ってしまうと
##7割のデータを選んで、学習。3割のデータで、テスト。
lrn <- sort(sample(1:length(splice.All[,1]),length(splice.All[,1])*0.7))

splice <- splice.All[lrn,]
##summary(splice)
##splice.tree <- tree(label ~ ., data=splice)
splice.tree <- tree(label ~ ., data=splice, minsize=1, mindev=0)
##type="uni"で枝の長さを揃える
plot(splice.tree, type="uni"); text(splice.tree)

##3割のデータでテスト
splice.prd <- predict(splice.tree, newdata=splice.All[-lrn,], type="class")
##予測と本当のラベル
table(splice.prd,(splice.All[-lrn,])$label)

##ちょっとだけ、[-lrn]の使い方。
seq(10,100,by=10)
##5番目、3番目、8番目を選ぶ。
(seq(10,100,by=10))[c(5,3,8)]
##5番目、3番目、8番目を含まない物全部。
(seq(10,100,by=10))[-c(5,3,8)]
##つまり、splice.All[lrn,]番目を学習に用いて、splice.All[-lrn,]を検査に使う

a <- table(splice.prd,(splice.All[-lrn,])$label)
##対角線要素以外の総和
row(a)
col(a)
##対角線要素の抽出
a[row(a)==col(a)]
##対角線要素以外の抽出
a[row(a)!=col(a)]
##エラーの割合は、
sum(a[row(a)!=col(a)])/sum(a)

##もしくは、対角要素を使って正解率。
diag(a)
sum(diag(a))/sum(a)
    

splice.tree <- tree(label ~ ., data=splice, minsize=1, mindev=0)
plot(splice.tree); text(splice.tree)
##splice.tree
##残りでテスト
splice.prd <- predict(splice.tree, newdata=splice.All[-lrn,], type="class")
##予測と本当のラベル
table(splice.prd,(splice.All[-lrn,])$label)


##植木屋さん。Trivialな枝から、切っていく。
lrn <- sort(sample(1:length(splice.All[,1]),length(splice.All[,1])*0.7))
splice <- splice.All[lrn,]
splice.tree <- tree(label ~ ., data=splice, minsize=10, mindev=0.001)
summary(splice.tree)

a <- table(splice.prd,(splice.All[-lrn,])$label)
##エラーの割合は、
sum(a[row(a)!=col(a)])/sum(a)


##んじゃ、どのくらいのサイズが良いのだろうか、虱潰し
splice <- splice.All
a.tree <- tree(label ~ ., data=splice, minsize=1, mindev=0)
errOpen  <- c()
errClose <- c()
for(ts in 50:2){
  a.tree <- prune.tree(a.tree, best=ts) ##枝刈りして、全体大きさがtsの物を作る。
  ##どの位出来ているかテスト。
  splice.prd <- predict(a.tree, newdata=splice[-lrn,], type="class")
  a <- table(splice.prd,(splice[-lrn,])$label)
  ##対角線要素以外の総和
  errOpen <- c(errOpen,sum(a[row(a)!=col(a)])/sum(a))

  splice.prd <- predict(a.tree, newdata=splice[lrn,], type="class")
  a <- table(splice.prd,(splice[lrn,])$label)
  ##対角線要素以外の総和
  errClose <- c(errClose,sum(a[row(a)!=col(a)])/sum(a))
}
##1:Open:未学習データ。2:Close:学習データ。
matplot(50:2, cbind(errOpen,errClose), type="b")

##「どのくらいの大きさの木が良いのだろうか」をお任せで
##cv.tree(splice.tree,FUN=prune.tree)
seq(splice.All[,1])##てやると、1から始まる、長さがsplice.All[,1]の番号列が得られる。
lrn <- sort(sample(seq(splice.All[,1]),length(splice.All[,1])*0.7))
##sampleは、「与えられたベクトル」から、n個を選ぶ。
splice <- splice.All[lrn,]
dim(splice.All)
dim(splice)
dim(splice.All)*0.7

splice.tree <- tree(label ~ ., data=splice, minsize=10, mindev=0.001)
##細かい決定木を作って。

splice.cv <- cv.tree(splice.tree,FUN=prune.tree)
##10Foldのクロスバリデーションを行う。
cvDevS <- c()
for(i in 1:10){##それを、10回繰り返す。
  cvDevS <- cbind(cvDevS, cv.tree(splice.tree,, prune.tree)$dev)
}
dim(cvDevS)
plot(splice.cv$size, apply(cvDevS,1,mean),type="b")
matplot(splice.cv$size, cvDevS,type="b",log="y")
splice.cv$size##なんか途中欠けているな、、、

##identify(splice.cv$size, apply(cvDevS,1,mean),splice.cv$size)
##右クリックで終了

##サイズが12が良さそう。#で木のサイズを12にしてみる
lrn <- sort(sample(1:length(splice.All[,1]),length(splice.All[,1])*0.7))
splice <- splice.All[lrn,]
splice.tree <- tree(label ~ ., data=splice, minsize=10, mindev=0.001)
##これで、枝狩り。。
splice.tree <- prune.tree(splice.tree,best=12)
splice.prd <- predict(splice.tree, newdata=splice.All[-lrn,], type="class")
table(splice.prd,(splice.All[-lrn,])$label)
plot(splice.tree); text(splice.tree)

splice.prd <- predict(splice.tree, newdata=splice.All[-lrn,], type="class")
table(splice.prd,(splice.All[-lrn,])$label)


##判別分析との比較の為に
dat1 <- read.table("dat1.txt")
dat2 <- read.table("dat2.txt")

####dat2 <- dat2+c(-2,0)
dat2[,1]<- dat2[,1]-3

dat <- cbind(rbind(dat1,dat2),rep(c("red","blue"),c(1000,1000)))

##非常に細かい決定木
dat.tree <- tree(dat[,3] ~ dat[,1]+dat[,2], minsize=1, mindev=0)

plot(dat.tree)
text(dat.tree)

plot(dat[,1], dat[,2], type="n")
text(dat[,1], dat[,2], c("b", "r")[dat[,3]], col=c("blue", "red")[dat[,3]])
partition.tree(dat.tree, add = TRUE, cex = 1.5)

table(predict(dat.tree, type="class"), dat[,3])

##おまかせ決定木
dat.tree <- tree(dat[,3] ~ dat[,1]+dat[,2])
table(predict(dat.tree, type="class"), dat[,3])
plot(dat[,1], dat[,2], type="n")
text(dat[,1], dat[,2], c("b", "r")[dat[,3]], col=c("blue", "red")[dat[,3]])
partition.tree(dat.tree, add = TRUE, cex = 1.5)

##判別分析
library(MASS)
dat.lda <- lda(dat[,3] ~ dat[,1]+dat[,2])
table(predict(dat.lda, dat)$class,dat[,3])


1