Commit 6438e800 authored by sauloal
Browse files

modular cmap parser

parent e4d0ff56
......@@ -33,204 +33,6 @@ bionano_parser.prototype.add_file = function(file) {
}
}
/**
 * Parse every registered input file, one after another, then call `clbk`.
 *
 * Order: key file -> reference cmap -> query cmap -> xmap -> smap (only when
 * an 'smap' entry exists in `this.files`). Each parser is started from the
 * completion callback of the previous one.
 *
 * When any of the four mandatory entries is missing from `this.files`, an
 * error naming the missing entries is logged and `clbk` is NOT invoked.
 *
 * @param {function} clbk  Completion callback. Receives this parser when no
 *                         smap is registered; otherwise receives whatever
 *                         parse_smap hands to its own callback.
 */
bionano_parser.prototype.parse = function(clbk) {
    var self     = this;
    var required = ['key_file', 'r_cmap', 'q_cmap', 'xmap'];
    var missing  = required.filter(function(name) {
        return !(name in self.files);
    });

    if ( missing.length > 0 ) {
        console.error('needs key file, r_cmap, q_cmap and xmap (missing: ' + missing.join(', ') + ')');
        return;
    }

    self.parse_key_file( self, function() {
        self.parse_r_cmap( self, function() {
            self.parse_q_cmap( self, function() {
                self.parse_xmap( self, function() {
                    if ( 'smap' in self.files ) {
                        self.parse_smap(self, function(s) { clbk(s); } );
                    } else {
                        clbk(self);
                    }
                });
            });
        });
    });
};
/**
 * Run every report generator, forwarding `clbk` unchanged to each one.
 *
 * The reference cmap, query cmap and xmap reporters always run; the smap
 * reporter runs only when an 'smap' entry exists in `self.files`.
 *
 * @param {object}   self  Parser instance holding `files` and the report_* methods.
 * @param {function} clbk  Callback handed to each reporter.
 */
bionano_parser.prototype.report = function(self, clbk) {
    self.report_r_cmap(self, clbk);
    self.report_q_cmap(self, clbk);
    self.report_xmap(  self, clbk);

    if ( 'smap' in self.files ) {
        // The method is named report_smap; the previous report_scmap call
        // referenced a method that does not exist and threw a TypeError.
        self.report_smap(self, clbk);
    }
};
/**
 * Render one parsed CMAP as a BED document and a wiggle document.
 *
 * Chromosomes are taken from the key file (`self.data['key_file']['data']`,
 * id -> [name, size]) and processed in sorted name order. For each chromosome
 * that has rows in `ldata['data']`:
 *   - one BED line is produced whose block lists start with a 0 placeholder
 *     and then hold one entry per accepted site (so the block count includes
 *     the placeholder);
 *   - wiggle header lines plus one "<position> 1.0" line per accepted site
 *     are produced.
 * A site is accepted when its Occurrence is > 0 and it starts after the end
 * (position + recognition-site length) of the previously accepted site.
 *
 * Chromosomes listed in the key file but absent from `ldata['data']` are
 * skipped instead of raising a TypeError.
 *
 * @param {object}   self        Parser instance (provides the key-file data).
 * @param {object}   ldata       Parsed map; reads 'filename', 'data',
 *                               'col_names' and 'nick_sites'.
 * @param {string}   desc        BED track description.
 * @param {string}   track_name  BED track name.
 * @param {function} clbk        Called twice: (filename + '.bed', text) and
 *                               then (filename + '.wig', text).
 */
bionano_parser.prototype.report_cmap = function(self, ldata, desc, track_name, clbk) {
    var filename   = ldata['filename'  ];
    var data       = ldata['data'      ];
    var col_names  = ldata['col_names' ];
    var nick_sites = ldata['nick_sites'];

    console.log('data', data);

    // Index the key file by chromosome name so the output can be sorted by name.
    var chroms      = [];
    var chrom_names = {};
    var chrom_sizes = {};
    var key_data    = self.data['key_file']['data'];

    for (var chrid in key_data) {
        var key_row = key_data[chrid];
        chroms.push(key_row[0]);
        chrom_names[key_row[0]] = chrid;
        chrom_sizes[key_row[0]] = key_row[1];
    }

    chroms.sort();

    console.log('chroms ', chroms );
    console.log('chrom_names', chrom_names);

    var indexOfLabelChannel = col_names.indexOf('LabelChannel');
    var indexOfPosition     = col_names.indexOf('Position'    );
    var indexOfOccurrence   = col_names.indexOf('Occurrence'  );

    var outdata_bed = [
        '#track name="'+track_name+'" description="'+desc+'" useScore=1 src="'+filename+'"'
    ];

    //http://genome.ucsc.edu/goldenPath/help/wiggle.html
    var outdata_wig = [];

    for ( var chromnum = 0; chromnum < chroms.length; chromnum++ ) {
        var chrom_name = chroms[chromnum];
        var chrom_size = chrom_sizes[chrom_name];
        var chrom_id   = chrom_names[chrom_name];

        console.log('chrom_name', chrom_name);
        console.log('chrom_id ', chrom_id );

        var cdata = data[chrom_id];

        if ( !cdata ) {
            // Key-file chromosome without any row in this map: nothing to report.
            console.log(' no data for', chrom_name, '- skipped');
            continue;
        }

        console.log(' cdatalen ', cdata.length);

        outdata_wig.push( '#browser position '+chrom_name+':0-'+(chrom_size - 1) );
        outdata_wig.push( '#track type=wiggle_0 name="reference" description="BioNano Genomics Reference Nicking Pattern" visibility=full autoScale=off viewLimits=0.0:1.0 color=0,255,00 altColor=255,0,0 alwaysZero=on graphType=bar smoothingWindow=off priority=10' );
        // The span is the recognition-sequence length of the first nick site.
        outdata_wig.push( 'variableStep chrom='+chrom_name+' span='+nick_sites[Object.keys(nick_sites)[0]][1] );

        var block_sizes  = [0];
        var block_starts = [0];
        var lastPos      = -1;
        var lastFeatSize = -1;

        for ( var rowid = 0; rowid < cdata.length; rowid++ ) {
            var row          = cdata[rowid];
            var LabelChannel = row[indexOfLabelChannel];
            var Position     = row[indexOfPosition    ];
            var Occurrence   = row[indexOfOccurrence  ];

            if ( !(Occurrence > 0) ) {
                continue;
            }

            // Drop sites that start inside the previously accepted site.
            if ((lastPos + lastFeatSize) >= Position) {
                continue;
            }

            var featSize = nick_sites[LabelChannel][1];

            block_sizes.push(featSize);
            block_starts.push(Position);
            outdata_wig.push( Position + " 1.0" );

            lastFeatSize = featSize;
            lastPos      = Position;
        }

        var chrom_end = lastPos + lastFeatSize;

        outdata_bed.push([
            chrom_name              , //  0 chrom
            0                       , //  1 start
            chrom_end               , //  2 end
            'ref'                   , //  3 name
            1000                    , //  4 score
            '+'                     , //  5 strand
            0                       , //  6 thick start
            chrom_end               , //  7 thick end
            0                       , //  8 RGB
            block_sizes.length      , //  9 count
            block_sizes.join(',')   , // 10 sizes
            block_starts.join(',')    // 11 starts
        ].join("\t"));
    }

    console.log('outdata_bed', outdata_bed);
    clbk(filename + '.bed', outdata_bed.join("\n"));

    console.log('outdata_wig', outdata_wig);
    clbk(filename + '.wig', outdata_wig.join("\n"));
};
//https://genome.ucsc.edu/FAQ/FAQformat.html#format1
/**
 * Report the reference CMAP (`self.data['r_cmap']`) through report_cmap.
 *
 * @param {object}   self  Parser instance holding the parsed data.
 * @param {function} clbk  Forwarded to report_cmap, which calls it with the
 *                         generated file names and contents.
 */
bionano_parser.prototype.report_r_cmap = function(self, clbk) {
    // Declared locally: these used to leak as implicit globals.
    var desc       = "BioNano Genomics - Reference Nicking Pattern";
    var track_name = "referenceNicking";
    var ldata      = self.data['r_cmap'];

    // clbk must be passed on; report_cmap invokes it unconditionally.
    self.report_cmap(self, ldata, desc, track_name, clbk);
};
/**
 * Report the query CMAP (`self.data['q_cmap']`) through report_cmap.
 *
 * @param {object}   self  Parser instance holding the parsed data.
 * @param {function} clbk  Forwarded to report_cmap, which calls it with the
 *                         generated file names and contents.
 */
bionano_parser.prototype.report_q_cmap = function(self, clbk) {
    // Declared locally: these used to leak as implicit globals.
    var desc       = "BioNano Genomics - Genomic Mapping Nicking Pattern";
    var track_name = "MappingNicking";
    var ldata      = self.data['q_cmap'];

    // clbk must be passed on; report_cmap invokes it unconditionally.
    self.report_cmap(self, ldata, desc, track_name, clbk);
};
/**
 * XMAP report hook. The body is empty: nothing is generated and `clbk` is
 * never invoked.
 *
 * @param {object}   self  Parser instance (unused).
 * @param {function} clbk  Report callback (unused).
 */
bionano_parser.prototype.report_xmap = function(self, clbk) {
    // Intentionally a no-op.
    return;
};
/**
 * SMAP report hook. The body is empty: nothing is generated and `clbk` is
 * never invoked.
 *
 * @param {object}   self  Parser instance (unused).
 * @param {function} clbk  Report callback (unused).
 */
bionano_parser.prototype.report_smap = function(self, clbk) {
    // Intentionally a no-op.
    return;
};
bionano_parser.prototype.read_file = function(file, clbk) {
/*
......@@ -259,8 +61,8 @@ bionano_parser.prototype.parse_header = function(ldata) {
var header_names = ldata['header'][ldata['header'].length-2].substring(3).split(/\s+/);
var header_types = ldata['header'][ldata['header'].length-1].substring(3).split(/\s+/);
//console.log('header_names', header_names);
//console.log('header_types', header_types);
console.log('header_names', header_names.length, header_names);
console.log('header_types', header_types.length, header_types);
for ( var l in header_types ) {
var ht = header_types[l];
......@@ -276,11 +78,48 @@ bionano_parser.prototype.parse_header = function(ldata) {
ldata['col_names' ] = header_names;
ldata['header_types'] = header_types;
// Look up the cell of `cols` that belongs to column `key` and return it after
// passing it through that column's entry in header_types. header_names and
// header_types come from the enclosing scope. An unknown column name trips
// the assertion; `false` is returned only if assert() itself returns.
function get_val(key, cols) {
    var idx = header_names.indexOf(key);

    if ( idx != -1 ) {
        return header_types[idx](cols[idx]);
    }

    assert(false, "no such column: "+key+" "+header_names);
    return false;
}
return get_val;
}
/**
 * Parse every registered input file, one after another, then call `clbk`.
 *
 * Order: key file -> reference cmap -> query cmap -> xmap -> smap (only when
 * an 'smap' entry exists in `this.files`). Each parser is started from the
 * completion callback of the previous one.
 *
 * When any of the four mandatory entries is missing from `this.files`, an
 * error naming the missing entries is logged and `clbk` is NOT invoked.
 *
 * @param {function} clbk  Completion callback. Receives this parser when no
 *                         smap is registered; otherwise receives whatever
 *                         parse_smap hands to its own callback.
 */
bionano_parser.prototype.parse = function(clbk) {
    var self     = this;
    var required = ['key_file', 'r_cmap', 'q_cmap', 'xmap'];
    var missing  = required.filter(function(name) {
        return !(name in self.files);
    });

    if ( missing.length > 0 ) {
        console.error('needs key file, r_cmap, q_cmap and xmap (missing: ' + missing.join(', ') + ')');
        return;
    }

    self.parse_key_file( self, function() {
        self.parse_r_cmap( self, function() {
            self.parse_q_cmap( self, function() {
                self.parse_xmap( self, function() {
                    if ( 'smap' in self.files ) {
                        self.parse_smap(self, function(s) { clbk(s); } );
                    } else {
                        clbk(self);
                    }
                });
            });
        });
    });
};
bionano_parser.prototype.parse_key_file = function(self, clbk) {
var file = self.files['key_file'];
console.log('parse_key_file', file.name);
......@@ -337,22 +176,76 @@ bionano_parser.prototype.parse_key_file = function(self, clbk) {
});
}
bionano_parser.prototype.parse_r_cmap = function(self, clbk) {
var file = self.files['r_cmap' ];
console.log('parse_r_cmap' , file.name);
/*
# CMAP File Version: 0.1
# Label Channels: 1
# Nickase Recognition Site 1: gctcttc
# Enzyme1: Nt.BspQI
# Number of Consensus Nanomaps: 13
#h CMapId ContigLength NumSites SiteID LabelChannel Position StdDev Coverage Occurrence
#f int float int int int float float int int
1 21805821.0 1558 1 1 10672.0 1.0 1 1
1 21805821.0 1558 2 1 31122.0 1.0 1 1
*/
/**
 * Store one CMAP data row in `ldata`.
 *
 * The first row seen for a chromosome id creates its entries in
 * ldata['data'], ldata['chr_size'] and ldata['chr_sites']. For the "r_cmap"
 * key only, that first row is also cross-checked against the key file: the
 * chromosome must be listed there and its size must match ContigLength.
 * Every cell of `cols` is then converted in place with the matching entry of
 * ldata['header_types'], the row is appended, and the per-chromosome and
 * total row counters are incremented.
 *
 * @param {object}   self     Parser instance (provides the key-file data).
 * @param {string}   key      Map being filled, e.g. "r_cmap" or "q_cmap".
 * @param {Array}    cols     Raw cells of the row; converted in place.
 * @param {function} get_val  (column name, cols) -> converted cell value.
 * @param {object}   ldata    Accumulator for the map being parsed.
 */
bionano_parser.prototype.add_cmap = function(self, key, cols, get_val, ldata) {
    var chrid   = get_val('CMapId'      , cols);
    var chrsize = get_val('ContigLength', cols);

    if (! (chrid in ldata['data'])) {
        if (key == "r_cmap") {
            var key_entry = self.data['key_file']['data'][chrid];
            // Fail with a readable message instead of a TypeError on key_entry[1].
            assert(key_entry !== undefined, 'chromosome ' + chrid + ' not present in key file');
            assert(chrsize == key_entry[1], 'chromosome size mismatch');
        }

        ldata['data'     ][chrid] = [];
        ldata['chr_size' ][chrid] = chrsize;
        ldata['chr_sites'][chrid] = 0;
    }

    for (var c = 0; c < cols.length; c++) {
        cols[c] = ldata['header_types'][c](cols[c]);
    }

    ldata['data'     ][chrid].push(cols);
    ldata['chr_sites'][chrid]++;
    ldata['total_sites']++;
};
/**
 * Store one XMAP alignment row in `ldata`, grouped by reference contig id.
 *
 * Columns of an XMAP row:
 *   XmapEntryID QryContigID RefContigID QryStartPos QryEndPos RefStartPos
 *   RefEndPos Orientation Confidence HitEnum QryLen RefLen LabelChannel Alignment
 *
 * Every row is validated: RefLen must equal the size recorded for the
 * reference contig in self.data['r_cmap'], and QryLen must equal the size
 * recorded for the query contig in self.data['q_cmap']. The query contig
 * differs from row to row, so the checks run per row rather than only when a
 * reference contig is first seen.
 *
 * Every cell of `cols` is converted in place with the matching entry of
 * ldata['header_types'], the row is appended, and the per-contig and total
 * row counters are incremented.
 *
 * @param {object}   self     Parser instance (provides r_cmap / q_cmap sizes).
 * @param {string}   key      Map being filled (not used by this adder).
 * @param {Array}    cols     Raw cells of the row; converted in place.
 * @param {function} get_val  (column name, cols) -> converted cell value.
 * @param {object}   ldata    Accumulator for the map being parsed.
 */
bionano_parser.prototype.add_xmap = function(self, key, cols, get_val, ldata) {
    var chrid   = get_val('RefContigID', cols);
    var chrsize = get_val('RefLen'     , cols);
    var tgtid   = get_val('QryContigID', cols);
    var tgtsize = get_val('QryLen'     , cols);

    var ref_size = self.data['r_cmap']['chr_size'][chrid];
    var tgt_size = self.data['q_cmap']['chr_size'][tgtid];

    assert((chrsize == ref_size), 'reference chromosome size mismatch. '+chrsize+' != '+ref_size);
    assert((tgtsize == tgt_size), 'target chromosome size mismatch. '+tgtsize+' != '+tgt_size);

    if (! (chrid in ldata['data'])) {
        ldata['data'     ][chrid] = [];
        ldata['chr_size' ][chrid] = chrsize;
        ldata['chr_sites'][chrid] = 0;
    }

    for (var c = 0; c < cols.length; c++) {
        cols[c] = ldata['header_types'][c](cols[c]);
    }

    ldata['data'     ][chrid].push(cols);
    ldata['chr_sites'][chrid]++;
    ldata['total_sites']++;
};
bionano_parser.prototype.parse_map = function(self, file, key, dfl, adder, clbk) {
self.read_file(file, function (text) {
self.data['r_cmap'] = {
self.data[key] = {
'filename' : file.name,
'header' : [],
'data' : {},
......@@ -362,13 +255,16 @@ bionano_parser.prototype.parse_r_cmap = function(self, clbk) {
'col_names' : [],
'nick_sites' : {}
};
var get_val = null;
//.replace(/(^\s*)|(\s*$)/g,'') trim white spaces
//
var rows = text.split(/\r\n|\n/);
var rows = text.split(/\r\n|\n/);
//console.log(rows);
var datanum = 0;
var ldata = self.data['r_cmap'];
var ldata = self.data[key];
for ( var linenum = 0, line; line = rows[linenum]; linenum++ ) {
//console.log(linenum, line);
if ( line.length > 0 ) {
......@@ -381,51 +277,56 @@ bionano_parser.prototype.parse_r_cmap = function(self, clbk) {
var num = cols[cols.length - 2];
var seq = cols[cols.length - 1];
num = parseInt(num.split(":")[0]);
num = parseInt(num.split(":")[0]);
ldata['nick_sites'][num] = [seq, seq.length];
}
} else { //data
datanum++;
if ( datanum == 1 ) { // col names
var dfl = "#h CMapId\tContigLength\tNumSites\tSiteID\tLabelChannel\tPosition\tStdDev\tCoverage\tOccurrence";
var hdl = ldata['header'][ldata['header'].length-2];
//console.log(ldata['header']);
assert(hdl == dfl, "line: '" + hdl + "' ("+hdl.length+") != " + "'"+dfl+"' ("+dfl.length+")");
self.parse_header(ldata);
get_val = self.parse_header(ldata);
} else {
var cols = line.split(/\s+/);
//console.log("cols", cols);
var chrid = parseInt(cols[0]);
var chrsize = parseInt(cols[1]);
var chrpos = parseInt(cols[5]);
assert(chrsize == self.data['key_file']['data' ][chrid][1], 'chromosome size mismatch');
if (! (chrid in ldata['data' ])) {
ldata['data' ][chrid] = [];
ldata['chr_size' ][chrid] = chrsize;
ldata['chr_sites' ][chrid] = 0;
}
for (var c in cols) {
cols[c] = ldata['header_types'][c](cols[c]);
}
ldata['data' ][chrid].push(cols);
ldata['chr_sites' ][chrid]++;
ldata['total_sites']++;
adder(self, key, cols, get_val, ldata);
}
}
}
}
console.log('parse_r_cmap', file.name, 'parsed', ldata);
console.log('parse_'+key, file.name, 'parsed', ldata);
assert(ldata['data'].length != 0);
clbk();
});
}
/*
 * Parse the reference CMAP registered under self.files['r_cmap'].
 *
 * Logs the file name, then delegates to parse_map with the "r_cmap" key, the
 * expected "#h" column-header line and add_cmap as the per-row handler;
 * `clbk` is handed to parse_map.
 *
 * Example of the expected file layout:
 *
 *   # CMAP File Version: 0.1
 *   # Label Channels: 1
 *   # Nickase Recognition Site 1: gctcttc
 *   # Enzyme1: Nt.BspQI
 *   # Number of Consensus Nanomaps: 13
 *   #h CMapId ContigLength NumSites SiteID LabelChannel Position StdDev Coverage Occurrence
 *   #f int float int int int float float int int
 *   1 21805821.0 1558 1 1 10672.0 1.0 1 1
 *   1 21805821.0 1558 2 1 31122.0 1.0 1 1
 */
bionano_parser.prototype.parse_r_cmap = function(self, clbk) {
    var cmap_file = self.files['r_cmap'];

    console.log('parse_r_cmap', cmap_file.name);

    var expected_header = "#h CMapId\tContigLength\tNumSites\tSiteID\tLabelChannel\tPosition\tStdDev\tCoverage\tOccurrence";
    var row_adder       = self.add_cmap;

    self.parse_map(self, cmap_file, "r_cmap", expected_header, row_adder, clbk);
};
bionano_parser.prototype.parse_q_cmap = function(self, clbk) {
var file = self.files['q_cmap' ];
console.log('parse_q_cmap' , file.name);
......@@ -443,65 +344,11 @@ bionano_parser.prototype.parse_q_cmap = function(self, clbk) {
35 4171773.0 1214 1 1 19.9 105.0 39.9 39.9
35 4171773.0 1214 2 1 2189.4 143.8 54.9 46.9
*/
self.read_file(file, function (text) {
self.data['q_cmap'] = {
'header' : [],
'data' : {},
'chr_size' : {},
'chr_sites' : {},
'total_sites': 0
};
//.replace(/(^\s*)|(\s*$)/g,'') trim white spaces
//
var rows = text.split(/\r\n|\n/);
//console.log(rows);
var datanum = 0;
var ldata = self.data['q_cmap'];
for ( var linenum = 0, line; line = rows[linenum]; linenum++ ) {
//console.log(linenum, line);
if ( line.length > 0 ) {
if ( line[0] == "#" ) { // header
ldata['header'].push(line);
} else { //data
datanum++;
if ( datanum == 1 ) { // col names
var dfl = "#h CMapId\tContigLength\tNumSites\tSiteID\tLabelChannel\tPosition\tStdDev\tCoverage\tOccurrence";
var hdl = ldata['header'][ldata['header'].length-2];
assert(hdl == dfl, "line: '" + hdl + "' ("+hdl.length+") != " + "'"+dfl+"' ("+dfl.length+")");
self.parse_header(ldata);
} else {
var cols = line.split(/\s+/);
//console.log("cols", cols);
var chrid = parseInt(cols[0]);
var chrsize = parseInt(cols[1]);
var chrpos = parseInt(cols[5]);
if (! (chrid in ldata['data' ])) {
ldata['data' ][chrid] = [];
ldata['chr_size' ][chrid] = chrsize;
ldata['chr_sites' ][chrid] = 0;
}
for (var c in cols) {
cols[c] = ldata['header_types'][c](cols[c]);
}
ldata['data' ][chrid].push(cols);
ldata['chr_sites' ][chrid]++;
ldata['total_sites']++;
}
}
}
}
console.log('parse_q_cmap', file.name, 'parsed', ldata);
assert(ldata['data'].length != 0);
clbk();
});
var dfl = "#h CMapId\tContigLength\tNumSites\tSiteID\tLabelChannel\tPosition\tStdDev\tCoverage\tOccurrence";
self.parse_map(self, file, "q_cmap", dfl, self.add_cmap, clbk);
}
bionano_parser.prototype.parse_xmap = function(self, clbk) {
var file = self.files['xmap' ];
console.log('parse_xmap' , file.name);
......@@ -519,75 +366,13 @@ bionano_parser.prototype.parse_xmap = function(self, clbk) {
#f int int int float float float float string float string float float int string
1 35 3 3838447.1 4147006.7 7578.0 319624.0 + 19.57 2M1D4M1I7M1D6M 4171773.0 55340444.0 1 (1,1194)(2,1195)(4,1196)(5,1197)(6,1198)(7,1199)(8,1201)(9,1202)(10,1203)(11,1204)(12,1205)(13,1206)(14,1207)(16,1208)(17,1209)(18,1210)(19,1211)(20,1212)(21,1213)
*/
self.read_file(file, function (text) {
self.data['xmap'] = {
'header' : [],
'data' : {},
'chr_sites' : {},
'total_sites': 0
};
//.replace(/(^\s*)|(\s*$)/g,'') trim white spaces
//
var rows = text.split(/\r\n|\n/);
//console.log(rows);
var datanum = 0;
var ldata = self.data['xmap'];
for ( var linenum = 0, line; line = rows[linenum]; linenum++ ) {
//console.log(linenum, line);
if ( line.length > 0 ) {
if ( line[0] == "#" ) { // header
self.data['xmap']['header'].push(line);
} else { //data
datanum++;
if ( datanum == 1 ) { // col names
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13
var dfl = "#h XmapEntryID\tQryContigID\tRefContigID\tQryStartPos\tQryEndPos\tRefStartPos\tRefEndPos\tOrientation\tConfidence\tHitEnum\tQryLen\tRefLen\tLabelChannel\tAlignment";
var hdl = self.data['xmap']['header'][self.data['xmap']['header'].length-2];
assert(hdl == dfl, "line: '" + hdl + "' ("+hdl.length+") != " + "'"+dfl+"' ("+dfl.length+")");
self.parse_header(ldata);
} else {
var cols = line.split(/\s+/);
for (var c in cols) {
cols[c] = ldata['header_types'][c](cols[c]);
}
//console.log("cols", cols);
var chrid = cols[ 2];
var chrsize = cols[11];
var chrpos = cols[ 5];
var tgtid = cols[ 1];
var tgtsize = cols[10];
var tgtpos = cols[ 3];
assert(parseInt(chrsize) == parseInt(self.data['r_cmap' ]['chr_size'][chrid]), 'reference chromosome size mismatch. '+chrsize+' != '+self.data['r_cmap' ]['chr_size'][chrid]);
assert(parseInt(tgtsize) == parseInt(self.data['q_cmap' ]['chr_size'][tgtid]), 'target chromosome size mismatch. '+tgtsize+' != '+self.data['q_cmap' ]['chr_size'][tgtid]);
if (! (chrid in ldata['data' ])) {
ldata['data' ][chrid] = {};
ldata['chr_sites' ][chrid] = 0;
}
if (!(chrpos in ldata['data' ][chrid])) {
ldata['data' ][chrid][chrpos] = [];
}
ldata['data' ][chrid][chrpos].push(cols);
ldata['chr_sites' ][chrid]++;
ldata['total_sites']++;
}
}
}
}
console.log('parse_xmap', file.name, 'parsed', ldata);
assert(ldata.length != 0);