# -*- mode: R -*-
##EmacsとESS使いの方に。モードラインの設定。
##2012年 7月 5日(金曜日)

##plot(1:10)
##setwd(choose.dir())

#########################
######  HMMの説明  ######
#########################

##sample(x, size, replace = FALSE, prob = NULL)
##配列xから、sizeを取り出す。配列の各要素が取り出される確率prob。

##barplot(c(0.1,0.7,0.2),col=c("yellow","blue","red"),names.arg=1:3)
##それぞれのボールの出現確率が、(0.1,0.7,0.2)
##「ボール取り出しの試行」を10000回繰り返した時の、出現頻度。

poi <- c()
for (i in 1:10000) {
  state <- sample(1:3, 1, prob=c(0.1,0.7,0.2))
  poi <- c( state, poi )
}
hist(poi)

        
## 1:不安定状態。30%の確率で安定状態に変化する(green)
## 2:安定状態。20%の確率で不安定状態に変化する(red)

###大気状態の遷移関数。
###遷移行列。tpm (transition probability matrix)

tpm <- matrix(c(0.7, 0.3,
                0.2, 0.8),2,2,byrow=TRUE)
tpm[1, ]
#[1] 0.7 0.3
##1は「安定状態」。
##次の状態は、0.7の確率で安定状態(1)に留まる、0.3の確率で不安定状態(2)に変化。

tpm[2, ]
#[1] 0.2 0.8
##2は「不安定状態」。
##次の状態は、0.2の確率で安定状態(1)に変化、0.8の確率で不安定状態(2)に留まる。

barplot(t(tpm),col=c("green","red"),names.arg=c("stable","unstable"))


history <- c()
nState <- ncol(tpm) ### 状態の数
state <- 1 ##最初は、安定からStart
for (i in 1:10000) {
  ##次のステートを決める。
  ##sample(x, size, replace = FALSE, prob = NULL)
  state <- sample(1:nState, 1, prob = tpm[state, ])###状態からの変遷
  ##probは、選択する確率。
  history <- c( state, history )
}
###各状態にどの位の確率で居るか
hist(history)

###状態遷移の履歴
plot(history, xlim=c(1,30), type="b")
title("history of transition")

###どの位続けて1の状態にいるか
len <- 0
lenhistory <- c()
for( i in 1:length(history)){
  if(history[i]==1){
    len <- len+1;
  } else {
    if(len!=0){
      lenhistory <- c(len, lenhistory)
    }
    len <- 0;
  }
}
  
hist(lenhistory, breaks=range(lenhistory)[2]+1)
a <- hist(lenhistory, breaks=range(lenhistory)[2]+1)
plot(a$counts, log="y")
title(sprintf("log(counts at state %d)",1))
##a$counts[-1]/a$counts[-length(a$counts)]
##par(new=T)
##plot(0.4^(0:20)*600,type="b")

###どの位続けて2の状態にいるか
len <- 0
lenhistory <- c()
for( i in 1:length(history)){
  if(history[i]==2){
    len <- len+1;
  } else {
    if(len!=0){
      lenhistory <- c(len, lenhistory)
    }
    len <- 0;
  }
}
1;
hist(lenhistory, breaks=range(lenhistory)[2]+1)

##でと、外から観測できるのは、晴れ1、曇り2、雨3。ラベル。
##ラベルの出現頻度は、大気の状態によって変わるはず
###「遷移行列」じゃ、無い。

Rho <- matrix(c(0.7, 0.2, 0.1,
                0.1, 0.4, 0.5), 3,2,byrow=FALSE)

## 安定状態:晴れ(0.7)、曇り(0.2)、雨(0.1)
## 不安定状態:晴れ(0.1)、曇り(0.4)、雨(0.5)

##コマンドでHMMパッケージの準備
##install.packages("hmm.discnp", repo="http://cran.md.tsukuba.ac.jp", dependencies=TRUE)
library(hmm.discnp)

##連続9日間の天気
##sim.hmm(nsim,tpm,Rho,ispd=NULL,yval=NULL,verb=FALSE)
y.sim <- sim.hmm(rep(9,10),tpm,Rho,ispd=c(1,0))
y.sim
## [[1]]
## [1] 1 1 2 3 1 2 1 2 2
## [[2]]
## [1] 1 3 1 1 1 2 1 3 3
##.......
## [[10]]
## [1] 1 1 1 1 1 3 2 3 3

##「100年分の夏休みの宿題」データより推定
y.sim <- sim.hmm(rep(40,100),tpm,Rho,ispd=c(1,0))
##MarkovModelのステートの数Kが2だとすると。
try <- hmm(y.sim,K=2,verb=T,stationary=T)
try$Rho
try$tpm

barplot(t(rbind(tpm,try$tpm)),
        names.arg=t(outer(c("true", "estimated"),1:2,paste)),
        col=c("green","red"))

barplot(cbind(Rho,try$Rho),
        names.arg=t(outer(c("true", "estimated"),1:2,paste)))


1;

##MarkovModelのステートの数Kが4だとすると。
try <- hmm(y.sim,K=4,verb=T,stationary=T)
t(try$tpm)
try$Rho

##いかさまに、状態、2，3，4をまとめてみる。
barplot(t(try$tpm),
        names.arg=paste("estimated",1:4))
tmp <- t(try$tpm)
apply(t(try$tpm)[2:4,],2,sum)
tmp[2,] <- apply(t(try$tpm)[2:4,],2,sum)
tmp[3:4,] <- 0
tmp

tmp[,2] <- apply(tmp[,2:4],1,mean)
tmp[,3:4] <- 0
tmp
barplot(tmp[1:2,1:2],
        names.arg=paste("estimated",1:2))


barplot(cbind(t(tpm),tmp[1:2,1:2]),
        names.arg=t(outer(c("true", "estimated"),1:2,paste)),
        col=c("green","red"))
##なんと無く似ている??


##状態数4のラベルの出現頻度は、???。
barplot(try$Rho,names.arg=paste("estimated",1:4))

##いかさまに、状態、2，3，4をまとめてみる。
tmp <- try$Rho
tmp[,2] <- apply(tmp[,2:4],1,mean)
tmp[,3:4] <- 0
tmp

##ラベルの出現頻度は、割と似ている??
barplot(cbind(Rho,tmp[,1:2]),
        names.arg=t(outer(c("true", "estimated"),1:2,paste)))

barplot(c(0.1,0.4,0.5),col=c("yellow","blue","red"),names.arg=1:3)
barplot(c(0.8,0.1,0.1),col=c("yellow","blue","red"),names.arg=1:3)
barplot(c(0.3,0.3,0.4),col=c("yellow","blue","red"),names.arg=1:3)
        
##他のモデルに当てはめると、どうなるか??
##受理の確率を知りたい。。。
y.sim <- sim.hmm(rep(40,100),tpm,Rho,ispd=c(1,0))

s.vit <- viterbi(y.sim, tpm=tpm, Rho=Rho, ispd=c(1,0))
s.vit##状態のリスト。ispd=c(1,0)なので、すべての配列は状態1から始まる。
rownames(Rho) <- 1:3 ##no 'dimnames' attribute for arrayは、これだった。
mean(log(pr(s.vit, y.sim, tpm=tpm, Rho=Rho)))
## -43.22726

###
tpm.uso <- matrix(c(0.5, 0.5,
                0.5, 0.5),2,2,byrow=TRUE)
Rho.uso <- matrix(c(0.3, 0.3, 0.4,
                0.3, 0.4, 0.3), 3,2,byrow=FALSE)

s.vit <- viterbi(y.sim, tpm=tpm.uso, Rho=Rho.uso, ispd=c(1,0))
s.vit##状態のリスト。ispd=c(1,0)なので、すべての配列は状態1から始まる。
rownames(Rho.uso) <- 1:3 
mean(log(pr(s.vit, y.sim, tpm=tpm.uso, Rho=Rho.uso)))
## -24.34478


##やりたいことは、EIのデータを学習できる様にしたい。
##setwd(choose.dir())

## state1->state2は、0.5, と、state1->state5も、0.5。
##     次の位置>> 1    2    3    4    5    6    7    8
tpm <- matrix(c(0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, ##1<<元の位置
                0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ##2
                0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ##3
                0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ##4
                0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ##5
                0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ##6
                0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ##7
                0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0), ##8
              8,8,byrow=TRUE)

barplot(t(tpm),col=1:8, names.arg=paste("state",1:8))
legend("bottomright",
       legend=paste("state",1:8),
       col=1:8,
       pch=22,##filled square, 
       lty=0,##線分の形式 (line type) を無し(透明の線)にする．こうすると、PCHだけになる。
       pt.bg=1:8,
       pt.cex=2,##expansion factor(s) for the points.
       )
title("transition probability matrix")
jpg.copy(sprintf("D:/oracle/howm/2013/04/img/add-1.jpg"),force=0)
##[[../../2013/04/img/add-1.jpg]]

##               1A   2T   3G   4C   5s   6e
a <- 0.2
Rho <- matrix(c(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ##1
                1/4-a, 1/4-a, 1/4+a, 1/4+a, 0.0, 0.0, ##2 ストリーム①GCリッチ
                1/4-a, 1/4-a, 1/4+a, 1/4+a, 0.0, 0.0, ##3 ストリーム①GCリッチ
                1/4-a, 1/4-a, 1/4+a, 1/4+a, 0.0, 0.0, ##4 ストリーム①GCリッチ
                1/4+a, 1/4+a, 1/4-a, 1/4-a, 0.0, 0.0, ##5 ストリーム②ATリッチ
                1/4+a, 1/4+a, 1/4-a, 1/4-a, 0.0, 0.0, ##6 ストリーム②ATリッチ
                1/4+a, 1/4+a, 1/4-a, 1/4-a, 0.0, 0.0, ##7 ストリーム②ATリッチ
                0.0, 0.0, 0.0, 0.0, 0.0, 1.0),##8
              6,8,byrow=FALSE)

barplot(Rho,
        col=c("red","green","blue","yellow","white","grey"),
        names.arg=paste("state",1:8))
title("Rho")
legend("bottomright",
       legend=c("A","T","G","C","s","e"),
       col=c("red","green","blue","yellow","white","grey"),
       pch=22,##filled square, 
       lty=0,##線分の形式 (line type) を無し(透明の線)にする．こうすると、PCHだけになる。
       pt.bg=c("red","green","blue","yellow","white","grey"),##the background color for the points, corresponding to its argument bg
       pt.cex=2,##expansion factor(s) for the points.
       )
       ##凡例間の距離x.intersp=2,
       ##凡例間の距離y.intersp=1.5)
jpg.copy(sprintf("D:/oracle/howm/2013/04/img/add-2.jpg"),force=0)
##[[../../2013/04/img/add-2.jpg]]


##y <- sim.hmm(rep(5,10), tpm,  Rho, yval=c("A","T","G","C","s","e"))
y <- sim.hmm(rep(5,1000), tpm, ispd=c(1.0, rep(0,7)), Rho, yval=c("A","T","G","C","s","e"))

## L <- list(hiris, hiris) # 2つのデータフレームからなるリスト
## lapply(L, "[", 1, 1) # 成分データフレームの第一行第一列をリストとして取り出し
##unlist(lapply(y,"[", 2:4))

## リストの行列化
##sim <- lapply(y,"[", 2:4)
sim <- do.call("rbind", y)
table(sim)
sum(table(sim))

##2種類の塩基配列が出来ているか??
##sim[3,2:4]
##試しに、ATなら-1点,GCなら1点。合計が正ならGCリッチ。
zz<-function(x){switch(EXPR = x,
			"A"=-1,
			"T"=-1,
			"G"=1,
			"C"=1,
                       0)}##ATGC以外の場合は0
##mean(sapply(sim[3,2:4],zz))
###mean(sapply(c("G","G","A"),zz))##これが、0を越えるとGCリッチ
zzz<-function(xx){mean(sapply(xx,zz))}
##zzz(c("G","G","A"))

apply(sim,1,zzz)
hist(apply(sim,1,zzz))
jpg.copy(sprintf("D:/oracle/howm/2013/04/img/add-3.jpg"),force=0)
##[[../../2013/04/img/add-3.jpg]]

##学習できるか??
##yが学習用のデータ。

##適当なtpm,Rhoを作って貰う。これをちょっと後で使う。
par0tmp <- init.all(nval=6, K=8, rand.start=list(tpm = FALSE, Rho = FALSE), mixture=FALSE)
##殆ど正解の所から、出発する。
try <- hmm(y, yval=c("A","T","G","C","s","e"),
           par0=list(tpm=(0.9*tpm+0.1*par0tmp$tpm),Rho=(0.9*Rho+0.1*par0tmp$Rho)),
           verb=T,stationary=T)
barplot(t(try$tpm),col=1:8, names.arg=paste("state",1:8))
title("transition probability matrix")
legend("bottomright",
       legend=paste("state",1:8),
       col=1:8,
       pch=22,##filled square, 
       lty=0,##線分の形式 (line type) を無し(透明の線)にする．こうすると、PCHだけになる。
       pt.bg=1:8,
       pt.cex=2,##expansion factor(s) for the points.
       )

jpg.copy(sprintf("D:/oracle/howm/2013/04/img/add-4.jpg"),force=0)
[[../../2013/04/img/add-4.jpg]]

barplot(try$Rho,
        col=c("red","green","blue","yellow","white","grey"),
        names.arg=paste("state",1:8))
title("Rho")
legend("bottomright",
       legend=c("A","T","G","C","s","e"),
       col=c("red","green","blue","yellow","white","grey"),
       pch=22,##filled square, 
       lty=0,##線分の形式 (line type) を無し(透明の線)にする．こうすると、PCHだけになる。
       pt.bg=c("red","green","blue","yellow","white","grey"),##the background color for the points, corresponding to its argument bg
       pt.cex=2,##expansion factor(s) for the points.
       )
       ##凡例間の距離x.intersp=2,
       ##凡例間の距離y.intersp=1.5)
jpg.copy(sprintf("D:/oracle/howm/2013/04/img/add-5.jpg"),force=0)
##[[../../2013/04/img/add-5.jpg]]

##殆ど、いい加減にスタートする。ジャングル。
try <- hmm(y, yval=c("A","T","G","C","s","e"),
           par0=list(tpm=(0.1*tpm+0.9*par0tmp$tpm),Rho=(0.1*Rho+0.9*par0tmp$Rho)),
           verb=T,stationary=T)
barplot(t(try$tpm),col=1:8, names.arg=paste("state",1:8))
title("transition probability matrix")
legend("bottomright",
       legend=paste("state",1:8),
       col=1:8,
       pch=22,##filled square, 
       lty=0,##線分の形式 (line type) を無し(透明の線)にする．こうすると、PCHだけになる。
       pt.bg=1:8,
       pt.cex=2,##expansion factor(s) for the points.
       )
jpg.copy(sprintf("D:/oracle/howm/2013/04/img/add-6.jpg"),force=0)
##[[../../2013/04/img/add-6.jpg]]

barplot(try$Rho,
        col=c("red","green","blue","yellow","white","grey"),
        names.arg=paste("state",1:8))
title("Rho")
legend("bottomright",
       legend=c("A","T","G","C","s","e"),
       col=c("red","green","blue","yellow","white","grey"),
       pch=22,##filled square, 
       lty=0,##線分の形式 (line type) を無し(透明の線)にする．こうすると、PCHだけになる。
       pt.bg=c("red","green","blue","yellow","white","grey"),##the background color for the points, corresponding to its argument bg
       pt.cex=2,##expansion factor(s) for the points.
       )
       ##凡例間の距離x.intersp=2,
       ##凡例間の距離y.intersp=1.5)
jpg.copy(sprintf("D:/oracle/howm/2013/04/img/add-7.jpg"),force=0)
##[[../../2013/04/img/add-7.jpg]]


##いよいよ本番。
##まず学習用のデータを作る。
splice.All <- read.table("splice.dat",header=TRUE)

table(splice.All[,1])
##  EI   IE    N 
## 764  766 1651

dim(splice.All)##これは、全ての種類のデータが含まれている。
dim(splice.All[splice.All[,1]=="EI",7:15])
ei <- splice.All[splice.All[,1]=="EI",7:15]
dim(ei)

as.character(unlist(ei[1,]))

c("s", as.character(unlist(ei[1,])), "e")

zz<-function(x){c("s", as.character(unlist(x)), "e")}
zz(ei[1,])
b <- t(apply(ei, 1, zz))
dim(b)
b[1,]
ei[1,]
b[2,]
ei[2,]

##y[[1]]


yy <- list(b[1,])
## yy
## yy <- c(yy, list(b[2,]))
## yy
for( i in 2:dim(ei)[1]){
  yy <- c(yy, list(b[i,]))
}
yy[1:10]
y[1:10]

try <- hmm(yy, yval=c("A","T","G","C","s","e"),
           K=10,
           verb=T,stationary=T)

a <- matrix(0,10,10)
diag(a) <- 1
a
a <- cbind(0,a)
a
a <- rbind(a,0)
a
a[11,11] <- 1
tpm <- a

a <- matrix(1/4,6-2,9)
a
a <- cbind(0,a,0)
a
a <- rbind(a,0,0)
a
a[5,1] <- 1
a[6,11] <- 1
Rho <- a

##rand.start <- list(tpm = FALSE, Rho = FALSE)
##適当にtpm,Rhoを作って貰う。
par0tmp <- init.all(nval=6, K=11, rand.start=list(tpm = FALSE, Rho = FALSE), mixture=FALSE)

##try <- hmm(y, yval=c("A","T","G","C","s","e"),par0=parL, verb=T,stationary=T)
##try <- hmm(y, yval=c("A","T","G","C","s","e"),
##           par0=list(tpm=parL$tpm,Rho=parL$Rho), verb=T,stationary=T)
##LeftToRightModelと混ぜてみる。
try <- hmm(y, yval=c("A","T","G","C","s","e"),
           par0=list(tpm=(0.9*tpm+0.1*par0tmp$tpm),Rho=(0.9*Rho+0.1*par0tmp$Rho)),
           verb=T,stationary=T)

barplot(t(try$tpm),col=1:11, names.arg=paste("state",1:11))
title("transition probability matrix")
legend("bottomright",
       legend=paste("state",1:11),
       col=1:11,
       pch=22,##filled square, 
       lty=0,##線分の形式 (line type) を無し(透明の線)にする．こうすると、PCHだけになる。
       pt.bg=1:11,
       pt.cex=2,##expansion factor(s) for the points.
       )
jpg.copy(sprintf("D:/oracle/howm/2013/04/img/add-8.jpg"),force=0)
##[[../../2013/04/img/add-8.jpg]]

barplot(try$Rho,
        col=c("red","green","blue","yellow","white","grey"),
        names.arg=paste("state",1:11))
title("Rho")
legend("bottomright",
       legend=c("A","T","G","C","s","e"),
       col=c("red","green","blue","yellow","white","grey"),
       pch=22,##filled square, 
       lty=0,##線分の形式 (line type) を無し(透明の線)にする．こうすると、PCHだけになる。
       pt.bg=c("red","green","blue","yellow","white","grey"),##the background color for the points, corresponding to its argument bg
       pt.cex=2,##expansion factor(s) for the points.
       )
       ##凡例間の距離x.intersp=2,
       ##凡例間の距離y.intersp=1.5)
jpg.copy(sprintf("D:/oracle/howm/2013/04/img/add-9.jpg"),force=0)
##[[../../2013/04/img/add-9.jpg]]

##でもね、NとEIの区別はつくよ。
dim(splice.All[splice.All[,1]=="N",7:15])
n <- splice.All[splice.All[,1]=="EI",7:15]

zz<-function(x){c("s", as.character(unlist(x)), "e")}
zz(ei[1,])
b <- t(apply(n, 1, zz))
dim(b)
b[1,]
ei[1,]
b[2,]
ei[2,]

##y[[1]]


yy <- list(b[1,])
## yy
## yy <- c(yy, list(b[2,]))
## yy
for( i in 2:dim(ei)[1]){
  yy <- c(yy, list(b[i,]))
}
yy