# -*- mode: R -*-
##EmacsとESS使いの方に。モードラインの設定。
## 2018年10月 5日(金)
## source("snn.txt")
## source("pat.txt")
## source("snnWeight.txt")
## source("snnWeight1.txt")
## read.table("dat1.txt")



##前回のところで、「LDAの結果が変じゃん」の突っ込みを頂いたので、
##グラフ表示を改めた、asp=1を付けた。
## lda(rbind(dat1,dat2),rep(c("red","blue"),each=1000))->b
## plot(dat1,xlim=c(-10,10),ylim=c(-10,10),col="red",asp=1)
## points(dat2,col="blue")
## abline(0, (b$scaling)[2]/(b$scaling)[1])
## abline(0, -(b$scaling)[1]/(b$scaling)[2],col="blue")


##Sigmoid関数の微分
f <- function(x){ 1/(1+exp(-x)) }
x <- seq(-10,10,0.1)
plot( x, f(x), type="b" )
##Mathematicaの記号計算。
points( x, exp(x)/(exp(x)+1)^2, type="p", col="red")
##手でちょろっと計算。
ff <- function(x){ f(x)*(1-f(x))}
points( x, ff(x), type="l", col="green")

##Rに頼ると、、
D(expression(1/(1+exp(-x))), "x")
##Rの記号計算では、exp(-x)/(1 + exp(-x))^2
##分子分母にexp(x)を掛けると、Mathematicaに一致する
fff <- deriv( ~ 1/(1+exp(-x)), "x", func=T)
points( x, attr(fff(x), "gradient"), type="l", col="blue", lw=3)

##
setwd(choose.dir())
source("snn.txt")
source("pat.txt")##これは、dump()コマンドで作成した物、
## EIが677個、Nが1478個。全部で2155個。

##1番目と2番目の入力データは、CAGGTCTGTとAGGGTGAGC
BufInput[,1:2]
##       CAGGTCTGT AGGGTGAGC
##  [1,]         0         1
##  [2,]         0         0
##  [3,]         0         0
##  [4,]         1         0
##  [5,]         1         0
##  [6,]         0         0
##  [7,]         0         1
##  [8,]         0         0
## .............
## [35,]         0         0
## [36,]         0         1

##1番目と2番目の入力データは、EI境界のデータ
BufTarget[,1:2]
##      EI EI
## [1,]  1  1
## [2,]  0  0

##MAGGTRAGTに合致するパターンはどのくらいあるの??
dim(BufInput)
colnames(BufInput)##カラムの名前には、元の9塩基が入っている。

dim(BufTarget)      
colnames(BufTarget)##カラムの名前には、{EI,N}が入っている。
colnames(BufTarget)=="EI"##EIかをTrueかFalseかで返す。

##EIの9塩基
eiDat <- (colnames(BufInput))[colnames(BufTarget)=="EI"]
length(eiDat)
##MAGGTRAGTに合致するパターンはどれかね
grep("[AC]AGGT[AG]AGT", eiDat)
length(grep("[AC]AGGT[AG]AGT", eiDat))/length(eiDat)

##・数値化のルール
##Aを(1,0,0,0)、Tを(0,1,0,0)、
##Gを(0,0,1,0)、Cを(0,0,0,1)、
##EI境界を(1,0)、無関係な配列を(0,1)


#最後は、Cかどうか?。失敗すれば、rtvが戻る
zz<-function(x){if(x=="A"){rtv<-c(1,0,0,0)};
                if(x=="T"){rtv<-c(0,1,0,0)};
                if(x=="G"){rtv<-c(0,0,1,0)};
                if(x=="C"){rtv<-c(0,0,0,1)};
                rtv
}

zz("C")
zz("A")

#同様な関数
zz<-function(x){switch(EXPR = x,
                       "A"=c(1,0,0,0),
                       "T"=c(0,1,0,0),
                       "G"=c(0,0,1,0),
                       "C"=c(0,0,0,1))}
substring("ATCC",1:4,1:4)
sapply(substring("ATCC",1:4,1:4),zz)
##      A T C C
## [1,] 1 0 0 0
## [2,] 0 1 0 0
## [3,] 0 0 0 0
## [4,] 0 0 1 1

## #ニューラルネットワークのプログラム
## 余分についているのが、
## InputHidden$w <- matrix(runif(length(Input$net)*length(Hidden$net),min=-0.1,max=0.1), length(Hidden$net), length(Input$net));
## Hidden$bias   <- runif(length(Hidden$net),min=-1,max=1);
## など、重みの値を乱数で初期化する。でないと。。。。

length(Input$net)
#[1] 36
length(Hidden$net)
#[1] 15
length(Output$net)
#[1] 2

str(InputHidden)

(a <- activate(1))##1番目のデータをフォワードプロパゲーションし、
##Errorを返す。「関数の最後の評価項目」が、「関数の戻り値」になる約束。
##大外を()で囲むと、「代入した結果」を表示してくれる。
##[1] 0.07193615
##エラー(2乗誤差)は、0.07193615
Output
##$out
##[1] 0.6053621 0.3633256
##EIに対しては、0.6053621 
##N(EIじゃない)に対しては、0.3633256
target
target-Output$out
(target-Output$out)^2
sum((target-Output$out)^2)
sum((target-Output$out)^2)/2
sum((target-Output$out)^2)/2/2
a
#2乗誤差が0でない。

e<-c()
for(i in 1:100){
     learn(1);##1番目のデータを100回学習する
     e<-c(e,Error);
}

plot(log(e),type="b")

##うーん。完璧!!

hist(sapply(1:100, activate), nc=30)
#どのデータに対してもErrorが小さい。出来た!!
#本当?
sapply(800:900, activate)
#えっ。何故何故どうして??

plot(sapply(1:2155, activate))
range(sapply(1:2155, activate))

date()
e<-c()
for(i in 1:10){
  for(iSet in order(runif(2155))){
    learn(iSet);
    e<-c(e,Error);
  }
}
date()

##途中の学習している様子。
plot(log(e))

##エラーを行列の形式に変換。各列が、学習回(2155個の学習)のエラー学習回
##でのエラーの平均を求めよう。

m<-matrix(e,nrow=2155)
mean(m[,1])
mean(m[,10])
plot(log(apply(m,2,mean)), typ="b")


##300回の学習結果を貯めて、後で使う為には、
##上記のループの回数を300にして、
##dump(c("InputHidden","Hidden","HiddenOutput","Output"),"snnWeight.txt")
##でファイルに保存。
##c("InputHidden","Hidden","HiddenOutput","Output")##が設定される。

##3分間クッキング
source("snnWeight.txt")

##エラーの様子。
plot(sapply(1:2155, activate))
hist(sapply(1:2155, activate),nc=300)
hist(log(sapply(1:2155, activate)),nc=300)


##失敗例について、
(1:2155)[sapply(1:2155, activate)>0.1]

(dimnames(BufInput)[[2]])[231]
colnames(BufInput)[231]
##[1] "CTGGTTCTC"

(dimnames(BufInput)[[2]])[sapply(1:2155, activate)>0.1]
                          

##で、この時の中間層は、どうなっているの??

hOut <- c()
for(iSet in 1:2155){
   activate(iSet);
   hOut <- cbind(hOut, Hidden$out)
}

##中間層の出力を行列の形式で。行方向に15個の中間層の出力、列方向にデー
##タ。

dim(hOut)

##相関係数の計算cor
cor(hOut[1,],hOut[2,])

##ヒストグラム
hist(hOut)

#中間層の出力のデータに対する、レスポンス。平均と分散。
plot(apply(hOut,2,"mean"))
plot(apply(hOut,2,"var"))

cor(t(hOut))->a
a[row(a)-col(a)>0]
hist(abs(a[row(a)-col(a)>0]),nc=20)

#####これは、グラフ書き専用。
dat1 <- read.table("dat1.txt")
plot(dat1,xlim=c(-8.5,2.5),ylim=c(-5,6))

##判別分析のグラフ用。角度をいれると射影したグラフが出来る。
angle <- 30
poi <- function(angle){
  par(mfrow=c(1,2))
  plot(dat1,xlim=c(-8.5,2.5),ylim=c(-5,6),asp=1)
  abline(a=mean(dat1[,2])-tan(angle*pi/180)*mean(dat1[,1]),b=tan(angle*pi/180)
         ,col="blue")
  d <- matrix(c(cos(angle*pi/180),sin(angle*pi/180)),2,1)##回転行列の計算
  std <- sqrt(var(as.matrix(dat1) %*% d))
  hist( as.matrix(dat1) %*% d, breaks=seq(-10,10,0.2),
       main=sprintf("angel: %d, std: %5.3f",angle,std),
       ylim=c(0,150))
  std
}

stdS <- sapply(seq(0,180,1),poi);
par(mfrow=c(1,1))
plot(0:180, stdS)
##stdSの最小値を見つけるには、
min(stdS)

##最小の角度を見つけるには、、、
##orderは、要素を「小さいもの順」に並べたときに
##何番目の要素が一番小さいか?を返す。
order(stdS)
##[1]  30  31  29  32  28  33  27  34  26  35
##....
##[181] 120
##つまり、一番小さいのはstdSの30番目の要素で、一番大きいのはstdSの120番目の要素。
stdS
##  [22] 0.6556325 0.6424442 0.6307256 0.620...
##  [29] 0.5972873 0.5960847 0.5967842 0.599...
##  [36] 0.6278818 0.6391947 0.6520037 0.666...

##じゃあ、「角度の30番目」と「角度の120番目」の値は??
(0:180)[30]
(0:180)[120]
##で、
abline(v=((0:180)[c(30,120)]), col=c("red","green"))


##主成分分析
prcomp(t(hOut))->b

plot((b$sdev)^2,type="b")
abline(h=0)

#累積寄与率
plot(cumsum((b$sdev)^2/sum((b$sdev)^2)),type="b",ylim=c(0,1))

#クラスター解析
dist(hOut)

##2次元平面にプロット
library(MASS)
dat.cmd<-cmdscale(as.dist(1.0-cor(t(hOut))))
plot(dat.cmd, type="b", pch="")
text(dat.cmd[,1],dat.cmd[,2], 1:15)
     


#15個の中間層の出力の"距離"を総当たりで求める。
hclust(dist(hOut), method="average")
#それに基づいて、階層的クラスタリングを行う。
plot(hclust(dist(hOut), method="average"))

#相関係数の計算cor
cor(hOut[3,],hOut[7,])

##で、相関係数を"距離"と見立てて行うためには、
plot(hclust(as.dist(1.0-cor(t(hOut))), method="average"))



###中間層を1つにしてみる。
#学習結果
source("snnWeight1.txt")
Hidden
## $net
## [1] 49.89437
## $out
## [1] 1
drop(InputHidden$w)

barplot(drop(InputHidden$w),
   col=rep(c("red","green","blue","yellow"),9)
)

##エラーの様子。
plot(sapply(1:2155, activate))

(1:2155)[sapply(1:2155, activate)>0.1]

(dimnames(BufInput)[[2]])[231]
##[1] "CTGGTTCTC"

(dimnames(BufInput)[[2]])[sapply(1:2155, activate)>0.1]
                          


##オマケ、lda()でもやってみよう。
library(MASS)

i <- as.vector(t(BufInput))
im <- matrix(i,nr=2155)
t <- dimnames(BufTarget)[[2]]
b <- lda(im,t)
         
plot(b)
matplot(predict(b,im)$posterior)
hist(predict(b,im)$x)

b$scaling

barplot(-drop(b$scaling),
   col=rep(c("red","green","blue","yellow"),9))