Commit e16bbfa2 authored by Haibao Tang

[optimize] Simplify optimize interface

Parent 82abd1b5
No related merge requests found
@@ -65,7 +65,8 @@ Optimize uses Genetic Algorithm (GA) to search for the best scoring solution. GA
![ga](tests/test-movie.gif)
```console
allhic optimize tests/test.counts_GATC.txt tests/test.clm
allhic optimize tests/test.counts_GATC.g0.txt tests/test.clm
allhic optimize tests/test.counts_GATC.g1.txt tests/test.clm
```
### <kbd>Build</kbd>
@@ -83,7 +84,8 @@ Following the 4 steps of `prune`, `extract`, `partition`, `optimize`
```console
allhic extract T4_Chr1/{prunning.sub.bam,seq.fasta}
allhic partition T4_Chr1/{prunning.sub.counts_GATC.txt,prunning.sub.pairs.txt} 2
allhic optimize T4_Chr1/{prunning.sub.counts_GATC.txt,prunning.sub.clm}
allhic optimize T4_Chr1/{prunning.sub.counts_GATC.g0.txt,prunning.sub.clm}
allhic optimize T4_Chr1/{prunning.sub.counts_GATC.g1.txt,prunning.sub.clm}
allhic build T4_Chr/{prunning.sub.tour,seq.fasta}
```
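With the simplified interface, `optimize` no longer takes a clusters file and group number; it is run once per group, on the per-group RE counts file (`*.g0.txt`, `*.g1.txt`, ...) that `partition` now writes, together with the shared `.clm` file. A minimal sketch of looping over all groups, assuming the files follow the `g<N>` naming shown above (the shell glob itself is illustrative, not part of the documented interface):

```console
for g in T4_Chr1/prunning.sub.counts_GATC.g*.txt; do
    allhic optimize "$g" T4_Chr1/prunning.sub.clm
done
```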
@@ -99,28 +99,28 @@ func NewCLM(Clmfile, REfile string) *CLM {
p.contacts = make(map[Pair]Contact)
p.orientedContacts = make(map[OrientedPair]GArray)
p.ParseIds()
p.ParseClm()
p.readRE()
p.readClm()
return p
}
// ParseIds parses the idsfile into data stored in CLM.
// readRE parses the idsfile into data stored in CLM.
// IDS file has a list of contigs that need to be ordered. The 'recover'
// keyword, if present in the third column, marks a contig as less confident.
// tig00015093 46912
// tig00035238 46779 recover
// tig00030900 119291
func (r *CLM) ParseIds() {
func (r *CLM) readRE() {
file, _ := os.Open(r.REfile)
log.Noticef("Parse idsfile `%s`", r.REfile)
log.Noticef("Parse REfile `%s`", r.REfile)
scanner := bufio.NewScanner(file)
idx := 0
for scanner.Scan() {
words := strings.Fields(scanner.Text())
tig := words[0]
size, _ := strconv.Atoi(words[len(words)-1])
r.Tigs = append(r.Tigs, &TigF{idx, tig, size, false})
r.Tigs = append(r.Tigs, &TigF{idx, tig, size, true})
r.tigToIdx[tig] = idx
idx++
}
@@ -134,8 +134,8 @@ func rr(b byte) byte {
return '-'
}
// ParseClmLines parses the clmfile into a slice of CLMLine
func ParseClmLines(clmfile string) []CLMLine {
// readClmLines parses the clmfile into a slice of CLMLine
func readClmLines(clmfile string) []CLMLine {
file, _ := os.Open(clmfile)
log.Noticef("Parse clmfile `%s`", clmfile)
reader := bufio.NewReader(file)
@@ -172,9 +172,9 @@ func ParseClmLines(clmfile string) []CLMLine {
return lines
}
// ParseClm parses the clmfile into data stored in CLM.
func (r *CLM) ParseClm() {
lines := ParseClmLines(r.Clmfile)
// readClm parses the clmfile into data stored in CLM.
func (r *CLM) readClm() {
lines := readClmLines(r.Clmfile)
for _, line := range lines {
// Make sure both contigs are in the ids file
ai, aok := r.tigToIdx[line.at]
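The renamed `readRE` keeps the same line-oriented parsing as the old `ParseIds`: contig name in the first whitespace-separated column, an integer taken from the last column, one contig per line. A minimal standalone sketch of that parsing step, reusing the sample lines from the comment above (the `reLine` struct and `parseRELine` helper are hypothetical, not part of the package):

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// reLine is a hypothetical stand-in for one parsed row of the RE/ids file.
type reLine struct {
	tig  string
	size int
}

// parseRELine mirrors readRE's per-line handling: first field is the contig
// name, the last field is parsed as an integer.
func parseRELine(line string) (reLine, error) {
	words := strings.Fields(line)
	if len(words) < 2 {
		return reLine{}, fmt.Errorf("malformed line: %q", line)
	}
	size, err := strconv.Atoi(words[len(words)-1])
	if err != nil {
		return reLine{}, err
	}
	return reLine{tig: words[0], size: size}, nil
}

func main() {
	for _, line := range []string{"tig00015093 46912", "tig00030900 119291"} {
		if rec, err := parseRELine(line); err == nil {
			fmt.Println(rec.tig, rec.size)
		}
	}
}
```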
@@ -154,7 +154,7 @@ can be generated with the "extract" sub-command.
Name: "optimize",
Usage: "Order-and-orient tigs in a group",
UsageText: `
allhic optimize counts_RE.txt clmfile clusters.txt group_number [options]
allhic optimize counts_RE.txt clmfile [options]
Optimize function:
Given a set of Hi-C contacts between contigs, as specified in the
@@ -196,15 +196,13 @@ on a cluster).
},
},
Action: func(c *cli.Context) error {
if len(c.Args()) < 4 {
if len(c.Args()) < 2 {
cli.ShowSubcommandHelp(c)
return cli.NewExitError("Must specify clmfile", 1)
}
refile := c.Args().Get(0)
clmfile := c.Args().Get(1)
clustersfile := c.Args().Get(2)
group, _ := strconv.Atoi(c.Args().Get(3))
runGA := !c.Bool("skipGA")
resume := c.Bool("resume")
seed := c.Int64("seed")
@@ -212,7 +210,6 @@ on a cluster).
ngen := c.Int("ngen")
mutpb := c.Float64("mutpb")
p := allhic.Optimizer{REfile: refile, Clmfile: clmfile,
Clustersfile: clustersfile, Group: group,
RunGA: runGA, Resume: resume,
Seed: seed, NPop: npop, NGen: ngen, MutProb: mutpb}
p.Run()
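With the clusters file and group number gone, the `Optimizer` is now fully described by the per-group RE counts file and the clm file. A minimal sketch of constructing it directly from Go rather than through the CLI, assuming the `github.com/tanghaibao/allhic` import path; the parameter values below are illustrative, not the CLI defaults:

```go
package main

import "github.com/tanghaibao/allhic"

func main() {
	// Group membership is implied by the per-group counts file,
	// so no Clustersfile/Group fields are needed.
	p := allhic.Optimizer{
		REfile:  "tests/test.counts_GATC.g0.txt",
		Clmfile: "tests/test.clm",
		RunGA:   true,
		Seed:    42,  // illustrative
		NPop:    100, // illustrative
		NGen:    500, // illustrative
		MutProb: 0.2, // illustrative
	}
	p.Run()
}
```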
@@ -151,22 +151,35 @@ func (r *Extracter) makeModel(outfile string) {
r.model = m
}
// readFastaAndWriteRE writes out the number of restriction fragments, one per line
func (r *Extracter) readFastaAndWriteRE() {
outfile := RemoveExt(r.Bamfile) + ".counts_" + r.RE + ".txt"
reader, _ := fastx.NewDefaultReader(r.Fastafile)
// writeRE writes an RE counts file, one contig per line
func writeRE(outfile string, contigs []*ContigInfo) {
f, _ := os.Create(outfile)
w := bufio.NewWriter(f)
defer f.Close()
totalCounts := 0
totalBp := int64(0)
r.contigs = []*ContigInfo{}
r.contigToIdx = map[string]int{}
fmt.Fprintf(w, "#Contig\tRECounts\tLength\n")
for _, contig := range contigs {
totalCounts += contig.recounts
totalBp += int64(contig.length)
fmt.Fprintf(w, "%s\n", contig)
}
w.Flush()
log.Noticef("RE counts (total: %d, avg 1 per %d bp) written to `%s`",
totalCounts, totalBp/int64(totalCounts), outfile)
}
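Factoring `writeRE` out lets the same three-column RE counts format be written both by `extract` and, per cluster, by `partition`'s new `splitRE` (below). Assuming `ContigInfo`'s `String` method renders the same tab-separated columns as the header, a hypothetical two-contig output file would look like this (the counts are made up; the names and lengths reuse the IDS example earlier in this diff):

```
#Contig	RECounts	Length
tig00015093	112	46912
tig00030900	287	119291
```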
// readFastaAndWriteRE writes out the number of restriction fragments, one per line
func (r *Extracter) readFastaAndWriteRE() {
outfile := RemoveExt(r.Bamfile) + ".counts_" + r.RE + ".txt"
reader, _ := fastx.NewDefaultReader(r.Fastafile)
seq.ValidateSeq = false // This flag makes parsing FASTA much faster
r.contigs = []*ContigInfo{}
r.contigToIdx = map[string]int{}
totalCounts := 0
totalBp := int64(0)
for {
rec, err := reader.Read()
if err == io.EOF {
@@ -178,18 +191,16 @@ func (r *Extracter) readFastaAndWriteRE() {
length := rec.Seq.Length()
totalCounts += count
totalBp += int64(length)
fmt.Fprintf(w, "%s\t%d\t%d\n", name, count, length)
contig := &ContigInfo{
name: name,
recounts: count, // To account for contigs with 0 RE sites
length: length,
}
r.contigToIdx[name] = len(r.contigs)
r.contigs = append(r.contigs, contig)
}
w.Flush()
log.Noticef("RE counts (total: %d, avg 1 per %d bp) written to `%s`",
totalCounts, totalBp/int64(totalCounts), outfile)
writeRE(outfile, r.contigs)
}
// calcIntraContigs determine the local enrichment of links on this contig.
@@ -207,7 +218,7 @@ func (r *Extracter) calcIntraContigs() {
// calcInterContigs calculates the MLE of distance between all contigs
func (r *Extracter) calcInterContigs() {
clmfile := RemoveExt(r.Bamfile) + ".clm"
lines := ParseClmLines(clmfile)
lines := readClmLines(clmfile)
contigPairs := make(map[[2]int]*ContigPair)
for i := 0; i < len(lines); i++ {
......
@@ -19,23 +19,21 @@ import (
// Optimizer runs the order-and-orientation procedure, given a clmfile
type Optimizer struct {
Clmfile string
REfile string
Clustersfile string
Group int
RunGA bool
Resume bool
Seed int64
NPop int
NGen int
MutProb float64
CrossProb float64
REfile string
Clmfile string
RunGA bool
Resume bool
Seed int64
NPop int
NGen int
MutProb float64
CrossProb float64
}
// Run kicks off the Optimizer
func (r *Optimizer) Run() {
clm := NewCLM(r.Clmfile, r.REfile)
tourfile := fmt.Sprintf("%s.g%d.tour", RemoveExt(r.Clmfile), r.Group)
tourfile := RemoveExt(r.REfile) + ".tour"
// Load tourfile if it exists
if _, err := os.Stat(tourfile); r.Resume && err == nil {
@@ -45,8 +43,6 @@ func (r *Optimizer) Run() {
backupTourFile := tourfile + ".sav"
os.Rename(tourfile, backupTourFile)
log.Noticef("Backup `%s` to `%s`", tourfile, backupTourFile)
} else {
clm.parseClustersFile(r.Clustersfile, r.Group)
}
shuffle := false // If one wants randomized initialization, set this to true
@@ -71,6 +67,7 @@ func (r *Optimizer) Run() {
break
}
}
clm.printTour(os.Stdout, clm.Tour, "FINAL")
log.Notice("Success")
}
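Because the tour file is now derived from the RE file name instead of the clm file plus a group index, each group's run leaves its own `.tour` next to its counts file, and the final tour is also echoed to stdout via `printTour`. A minimal sketch of the new naming rule, using a hypothetical local `removeExt` in place of the package's `RemoveExt` helper:

```go
package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// removeExt is a hypothetical stand-in for the package's RemoveExt helper.
func removeExt(path string) string {
	return strings.TrimSuffix(path, filepath.Ext(path))
}

func main() {
	refile := "T4_Chr1/prunning.sub.counts_GATC.g0.txt"
	// Tour file naming as in Optimizer.Run: RemoveExt(REfile) + ".tour"
	fmt.Println(removeExt(refile) + ".tour")
	// Prints: T4_Chr1/prunning.sub.counts_GATC.g0.tour
}
```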
@@ -39,6 +39,7 @@ func (r *Partitioner) Run() {
r.Cluster()
}
r.printClusters()
r.splitRE()
log.Notice("Success")
}
@@ -183,6 +184,19 @@ func (r *Partitioner) readRE() {
len(r.contigs), r.Contigsfile)
}
// splitRE writes one RE counts file per cluster, each three-column and tab-separated:
// #Contig RECounts Length
func (r *Partitioner) splitRE() {
for j, cl := range r.clusters {
contigs := []*ContigInfo{}
for _, idx := range cl {
contigs = append(contigs, r.contigs[idx])
}
outfile := fmt.Sprintf("%s.g%d.txt", RemoveExt(r.Contigsfile), j)
writeRE(outfile, contigs)
}
}
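`splitRE` fans the contigs of each cluster out into their own RE counts file, which is exactly what the simplified `optimize` interface consumes. A minimal sketch of that fan-out on hypothetical data (plain strings and a slice of index lists stand in for `r.contigs` and `r.clusters`):

```go
package main

import "fmt"

func main() {
	// Hypothetical stand-ins for r.contigs and r.clusters.
	contigs := []string{"tig00015093", "tig00035238", "tig00030900"}
	clusters := [][]int{{0, 2}, {1}}

	for j, cl := range clusters {
		group := []string{}
		for _, idx := range cl {
			group = append(group, contigs[idx])
		}
		// splitRE would hand the per-group slice to writeRE, naming the
		// output <Contigsfile-without-ext>.g<j>.txt.
		fmt.Printf("g%d: %v\n", j, group)
	}
}
```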
// parseDist imports the edges of the contig into a slice of DistLine
// DistLine stores the data structure of the distfile
// #X Y Contig1 Contig2 RE1 RE2 ObservedLinks ExpectedLinksIfAdjacent