Commit e16bbfa2 authored by Haibao Tang

[optimize] Simplify optimize interface

Parent 82abd1b5
No related merge requests found
@@ -65,7 +65,8 @@ Optimize uses Genetic Algorithm (GA) to search for the best scoring solution. GA
![ga](tests/test-movie.gif)
```console
allhic optimize tests/test.counts_GATC.txt tests/test.clm
allhic optimize tests/test.counts_GATC.g0.txt tests/test.clm
allhic optimize tests/test.counts_GATC.g1.txt tests/test.clm
```
### <kbd>Build</kbd>
@@ -83,7 +84,8 @@ Following the 4 steps of `prune`, `extract`, `partition`, `optimize`
```console
allhic extract T4_Chr1/{prunning.sub.bam,seq.fasta}
allhic partition T4_Chr1/{prunning.sub.counts_GATC.txt,prunning.sub.pairs.txt} 2
allhic optimize T4_Chr1/{prunning.sub.counts_GATC.txt,prunning.sub.clm}
allhic optimize T4_Chr1/{prunning.sub.counts_GATC.g0.txt,prunning.sub.clm}
allhic optimize T4_Chr1/{prunning.sub.counts_GATC.g1.txt,prunning.sub.clm}
allhic build T4_Chr/{prunning.sub.tour,seq.fasta}
```
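With the simplified interface, `optimize` no longer takes a clusters file and group number; it is run once per group, on the per-group RE counts file (`*.g0.txt`, `*.g1.txt`, ...) that `partition` now writes, together with the shared `.clm` file. A minimal sketch of looping over all groups, assuming the files follow the `g<N>` naming shown above (the shell glob itself is illustrative, not part of the documented interface):

```console
for g in T4_Chr1/prunning.sub.counts_GATC.g*.txt; do
    allhic optimize "$g" T4_Chr1/prunning.sub.clm
done
```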
@@ -99,28 +99,28 @@ func NewCLM(Clmfile, REfile string) *CLM {
p.contacts = make(map[Pair]Contact)
p.orientedContacts = make(map[OrientedPair]GArray)
p.ParseIds()
p.ParseClm()
p.readRE()
p.readClm()
return p
}
// ParseIds parses the idsfile into data stored in CLM.
// readRE parses the idsfile into data stored in CLM.
// IDS file has a list of contigs that need to be ordered. The 'recover'
// keyword, if present in the third column, marks a contig as less confident.
// tig00015093 46912
// tig00035238 46779 recover
// tig00030900 119291
func (r *CLM) ParseIds() {
func (r *CLM) readRE() {
file, _ := os.Open(r.REfile)
log.Noticef("Parse idsfile `%s`", r.REfile)
log.Noticef("Parse REfile `%s`", r.REfile)
scanner := bufio.NewScanner(file)
idx := 0
for scanner.Scan() {
words := strings.Fields(scanner.Text())
tig := words[0]
size, _ := strconv.Atoi(words[len(words)-1])
r.Tigs = append(r.Tigs, &TigF{idx, tig, size, false})
r.Tigs = append(r.Tigs, &TigF{idx, tig, size, true})
r.tigToIdx[tig] = idx
idx++
}
@@ -134,8 +134,8 @@ func rr(b byte) byte {
return '-'
}
// ParseClmLines parses the clmfile into a slice of CLMLine
func ParseClmLines(clmfile string) []CLMLine {
// readClmLines parses the clmfile into a slice of CLMLine
func readClmLines(clmfile string) []CLMLine {
file, _ := os.Open(clmfile)
log.Noticef("Parse clmfile `%s`", clmfile)
reader := bufio.NewReader(file)
@@ -172,9 +172,9 @@ func ParseClmLines(clmfile string) []CLMLine {
return lines
}
// ParseClm parses the clmfile into data stored in CLM.
func (r *CLM) ParseClm() {
lines := ParseClmLines(r.Clmfile)
// readClm parses the clmfile into data stored in CLM.
func (r *CLM) readClm() {
lines := readClmLines(r.Clmfile)
for _, line := range lines {
// Make sure both contigs are in the ids file
ai, aok := r.tigToIdx[line.at]
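The renamed `readRE` keeps the same line-oriented parsing as the old `ParseIds`: contig name in the first whitespace-separated column, an integer taken from the last column, one contig per line. A minimal standalone sketch of that parsing step, reusing the sample lines from the comment above (the `reLine` struct and `parseRELine` helper are hypothetical, not part of the package):

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// reLine is a hypothetical stand-in for one parsed row of the RE/ids file.
type reLine struct {
	tig  string
	size int
}

// parseRELine mirrors readRE's per-line handling: first field is the contig
// name, the last field is parsed as an integer.
func parseRELine(line string) (reLine, error) {
	words := strings.Fields(line)
	if len(words) < 2 {
		return reLine{}, fmt.Errorf("malformed line: %q", line)
	}
	size, err := strconv.Atoi(words[len(words)-1])
	if err != nil {
		return reLine{}, err
	}
	return reLine{tig: words[0], size: size}, nil
}

func main() {
	for _, line := range []string{"tig00015093 46912", "tig00030900 119291"} {
		if rec, err := parseRELine(line); err == nil {
			fmt.Println(rec.tig, rec.size)
		}
	}
}
```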
@@ -154,7 +154,7 @@ can be generated with the "extract" sub-command.
Name: "optimize",
Usage: "Order-and-orient tigs in a group",
UsageText: `
allhic optimize counts_RE.txt clmfile clusters.txt group_number [options]
allhic optimize counts_RE.txt clmfile [options]
Optimize function:
Given a set of Hi-C contacts between contigs, as specified in the
@@ -196,15 +196,13 @@ on a cluster).
},
},
Action: func(c *cli.Context) error {
if len(c.Args()) < 4 {
if len(c.Args()) < 2 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify clmfile", 1)
}
refile := c.Args().Get(0)
clmfile := c.Args().Get(1)
clustersfile := c.Args().Get(2)
group, _ := strconv.Atoi(c.Args().Get(3))
runGA := !c.Bool("skipGA")
resume := c.Bool("resume")
seed := c.Int64("seed")
@@ -212,7 +210,6 @@ on a cluster).
ngen := c.Int("ngen")
mutpb := c.Float64("mutpb")
p := allhic.Optimizer{REfile: refile, Clmfile: clmfile,
Clustersfile: clustersfile, Group: group,
RunGA: runGA, Resume: resume,
Seed: seed, NPop: npop, NGen: ngen, MutProb: mutpb}
p.Run()
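With the clusters file and group number gone, the `Optimizer` is now fully described by the per-group RE counts file and the clm file. A minimal sketch of constructing it directly from Go rather than through the CLI, assuming the `github.com/tanghaibao/allhic` import path; the parameter values below are illustrative, not the CLI defaults:

```go
package main

import "github.com/tanghaibao/allhic"

func main() {
	// Group membership is implied by the per-group counts file,
	// so no Clustersfile/Group fields are needed.
	p := allhic.Optimizer{
		REfile:  "tests/test.counts_GATC.g0.txt",
		Clmfile: "tests/test.clm",
		RunGA:   true,
		Seed:    42,  // illustrative
		NPop:    100, // illustrative
		NGen:    500, // illustrative
		MutProb: 0.2, // illustrative
	}
	p.Run()
}
```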
@@ -151,22 +151,35 @@ func (r *Extracter) makeModel(outfile string) {
r.model = m
}
// readFastaAndWriteRE writes out the number of restriction fragments, one per line
func (r *Extracter) readFastaAndWriteRE() {
outfile := RemoveExt(r.Bamfile) + ".counts_" + r.RE + ".txt"
reader, _ := fastx.NewDefaultReader(r.Fastafile)
// writeRE writes an RE counts file, one contig per line
func writeRE(outfile string, contigs []*ContigInfo) {
f, _ := os.Create(outfile)
w := bufio.NewWriter(f)
defer f.Close()
totalCounts := 0
totalBp := int64(0)
r.contigs = []*ContigInfo{}
r.contigToIdx = map[string]int{}
fmt.Fprintf(w, "#Contig\tRECounts\tLength\n")
for _, contig := range contigs {
totalCounts += contig.recounts
totalBp += int64(contig.length)
fmt.Fprintf(w, "%s\n", contig)
}
w.Flush()
log.Noticef("RE counts (total: %d, avg 1 per %d bp) written to `%s`",
totalCounts, totalBp/int64(totalCounts), outfile)
}
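Factoring `writeRE` out lets the same three-column RE counts format be written both by `extract` and, per cluster, by `partition`'s new `splitRE` (below). Assuming `ContigInfo`'s `String` method renders the same tab-separated columns as the header, a hypothetical two-contig output file would look like this (the counts are made up; the names and lengths reuse the IDS example earlier in this diff):

```
#Contig	RECounts	Length
tig00015093	112	46912
tig00030900	287	119291
```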
// readFastaAndWriteRE writes out the number of restriction fragments, one per line
func (r *Extracter) readFastaAndWriteRE() {
outfile := RemoveExt(r.Bamfile) + ".counts_" + r.RE + ".txt"
reader, _ := fastx.NewDefaultReader(r.Fastafile)
seq.ValidateSeq = false // This flag makes parsing FASTA much faster
r.contigs = []*ContigInfo{}
r.contigToIdx = map[string]int{}
totalCounts := 0
totalBp := int64(0)
for {
rec, err := reader.Read()
if err == io.EOF {
@@ -178,18 +191,16 @@ func (r *Extracter) readFastaAndWriteRE() {
length := rec.Seq.Length()
totalCounts += count
totalBp += int64(length)
fmt.Fprintf(w, "%s\t%d\t%d\n", name, count, length)
contig := &ContigInfo{
name: name,
recounts: count, // To account for contigs with 0 RE sites
length: length,
}
r.contigToIdx[name] = len(r.contigs)
r.contigs = append(r.contigs, contig)
}
w.Flush()
log.Noticef("RE counts (total: %d, avg 1 per %d bp) written to `%s`",
totalCounts, totalBp/int64(totalCounts), outfile)
writeRE(outfile, r.contigs)
}
// calcIntraContigs determine the local enrichment of links on this contig.
@@ -207,7 +218,7 @@ func (r *Extracter) calcIntraContigs() {
// calcInterContigs calculates the MLE of distance between all contigs
func (r *Extracter) calcInterContigs() {
clmfile := RemoveExt(r.Bamfile) + ".clm"
lines := ParseClmLines(clmfile)
lines := readClmLines(clmfile)
contigPairs := make(map[[2]int]*ContigPair)
for i := 0; i < len(lines); i++ {
......
@@ -19,23 +19,21 @@ import (
// Optimizer runs the order-and-orientation procedure, given a clmfile
type Optimizer struct {
Clmfile string
REfile string
Clustersfile string
Group int
RunGA bool
Resume bool
Seed int64
NPop int
NGen int
MutProb float64
CrossProb float64
REfile string
Clmfile string
RunGA bool
Resume bool
Seed int64
NPop int
NGen int
MutProb float64
CrossProb float64
}
// Run kicks off the Optimizer
func (r *Optimizer) Run() {
clm := NewCLM(r.Clmfile, r.REfile)
tourfile := fmt.Sprintf("%s.g%d.tour", RemoveExt(r.Clmfile), r.Group)
tourfile := RemoveExt(r.REfile) + ".tour"
// Load tourfile if it exists
if _, err := os.Stat(tourfile); r.Resume && err == nil {
@@ -45,8 +43,6 @@ func (r *Optimizer) Run() {
backupTourFile := tourfile + ".sav"
os.Rename(tourfile, backupTourFile)
log.Noticef("Backup `%s` to `%s`", tourfile, backupTourFile)
} else {
clm.parseClustersFile(r.Clustersfile, r.Group)
}
shuffle := false // If one wants randomized initialization, set this to true
@@ -71,6 +67,7 @@ func (r *Optimizer) Run() {
break
}
}
clm.printTour(os.Stdout, clm.Tour, "FINAL")
log.Notice("Success")
}
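Because the tour file is now derived from the RE file name instead of the clm file plus a group index, each group's run leaves its own `.tour` next to its counts file, and the final tour is also echoed to stdout via `printTour`. A minimal sketch of the new naming rule, using a hypothetical local `removeExt` in place of the package's `RemoveExt` helper:

```go
package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// removeExt is a hypothetical stand-in for the package's RemoveExt helper.
func removeExt(path string) string {
	return strings.TrimSuffix(path, filepath.Ext(path))
}

func main() {
	refile := "T4_Chr1/prunning.sub.counts_GATC.g0.txt"
	// Tour file naming as in Optimizer.Run: RemoveExt(REfile) + ".tour"
	fmt.Println(removeExt(refile) + ".tour")
	// Prints: T4_Chr1/prunning.sub.counts_GATC.g0.tour
}
```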
@@ -39,6 +39,7 @@ func (r *Partitioner) Run() {
r.Cluster()
}
r.printClusters()
r.splitRE()
log.Notice("Success")
}
@@ -183,6 +184,19 @@ func (r *Partitioner) readRE() {
len(r.contigs), r.Contigsfile)
}
// splitRE writes one RE counts file per cluster, each three-column and tab-separated:
// #Contig RECounts Length
func (r *Partitioner) splitRE() {
for j, cl := range r.clusters {
contigs := []*ContigInfo{}
for _, idx := range cl {
contigs = append(contigs, r.contigs[idx])
}
outfile := fmt.Sprintf("%s.g%d.txt", RemoveExt(r.Contigsfile), j)
writeRE(outfile, contigs)
}
}
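`splitRE` fans the contigs of each cluster out into their own RE counts file, which is exactly what the simplified `optimize` interface consumes. A minimal sketch of that fan-out on hypothetical data (plain strings and a slice of index lists stand in for `r.contigs` and `r.clusters`):

```go
package main

import "fmt"

func main() {
	// Hypothetical stand-ins for r.contigs and r.clusters.
	contigs := []string{"tig00015093", "tig00035238", "tig00030900"}
	clusters := [][]int{{0, 2}, {1}}

	for j, cl := range clusters {
		group := []string{}
		for _, idx := range cl {
			group = append(group, contigs[idx])
		}
		// splitRE would hand the per-group slice to writeRE, naming the
		// output <Contigsfile-without-ext>.g<j>.txt.
		fmt.Printf("g%d: %v\n", j, group)
	}
}
```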
// parseDist imports the edges of the contig into a slice of DistLine
// DistLine stores the data structure of the distfile
// #X Y Contig1 Contig2 RE1 RE2 ObservedLinks ExpectedLinksIfAdjacent