1: <?php
2:
3: namespace cli_import;
4:
5: require_once ROOT . 'classes/AbstractImporter.php';
6: require_once ROOT . 'commands/Importer_Sequence_Ids.php';
7:
8: 9: 10:
/**
 * Command-line importer for RepeatMasker annotation files.
 *
 * Each input line that matches the expected layout becomes one row in
 * `feature`, one row in `featureloc` and two or three rows in `featureprop`
 * (repeat name, repeat class and, when present, repeat family).
 */
class Importer_Annotations_Repeatmasker extends AbstractImporter {

    /**
     * Imports all matching lines of a RepeatMasker file inside one transaction.
     *
     * Lines that do not match the expected layout are logged with
     * PEAR_LOG_NOTICE and skipped; blank lines are skipped silently.
     * On any exception the transaction is rolled back and the exception
     * is re-thrown.
     *
     * @param array $options Importer options; only $options['file'] (path of
     *                       the file to import) is read here.
     * @return array Map with the number of imported lines under the key
     *               LINES_IMPORTED and the number of family annotations
     *               written under 'families_added'.
     * @throws \RuntimeException If the input file cannot be opened.
     * @throws \ErrorException   If the transaction cannot be committed.
     */
    public static function import($options) {

        $filename = $options['file'];

        // Feed the file to wc via stdin so only the count is printed, and
        // escape the path so it cannot be interpreted by the shell.
        $lines_total = trim((string) shell_exec('wc -l < ' . escapeshellarg($filename)));
        self::setLineCount($lines_total);

        global $db;

        // Extended-mode pattern for one input line; fields are separated by
        // single spaces. The hyphen in the repeat name/class/family character
        // classes is deliberately placed LAST so that it is a literal: written
        // as "()-?" it formed the range ")".."?" which contains "/", making
        // repeat_class swallow the "/family" part so that repeat_family was
        // never captured.
        // NOTE: comments inside the pattern must not contain curly braces,
        // because curly braces are used as the pattern delimiters.
        $regex = <<<'EOF'
{^
\d+[ ]
# 1320 = Smith-Waterman score of the match, usually complexity adjusted
\d+\.\d+[ ]
# 15.6 = % divergence = mismatches/(matches+mismatches) **
\d+\.\d+[ ]
# 6.2 = % of bases opposite a gap in the query sequence (deleted bp)
\d+\.\d+[ ]
# 0.0 = % of bases opposite a gap in the repeat consensus (inserted bp)
(?<name>\w+)[ ]
# HSU08988 = name of query sequence
(?<start>\d+)[ ]
# 6563 = starting position of match in query sequence
(?<end>\d+)[ ]
# 6781 = ending position of match in query sequence
\(\d+\)[ ]
# (22462) = no. of bases in query sequence past the ending position of match
(?:[C+][ ])?
# C = match is with the Complement of the repeat consensus sequence
(?<repeat_name>[\w()?-]+)\#
# MER7A = name of the matching interspersed repeat
(?<repeat_class>[\w()?-]+)
(?:/(?<repeat_family>[\w()?-]+))?[ ]
# DNA/MER2_type = the class of the repeat, in this case a DNA transposon fossil of the MER2 group (see below for list and references)
\(?\d+\)?[ ]
# (0) = no. of bases in (complement of) the repeat consensus sequence prior to beginning of the match (0 means that the match extended all the way to the end of the repeat consensus sequence)
\(?\d+\)?[ ]
# 337 = starting position of match in repeat consensus sequence
\(?\d+\)?[ ]
# 104 = ending position of match in repeat consensus sequence
\d+
# 20 = unique identifier for individual insertions
$}x
EOF;

        $lines_imported = 0;
        $families_added = 0;
        $file = false;

        try {
            $db->beginTransaction();
            $import_prefix_id = Importer_Sequence_Ids::get_import_dbxref();

            // Variables bound by reference below; assigning to them changes
            // the values used by the next execute() of the statement.
            $param_name = null;
            $param_uniquename = null;
            $param_cvterm = null;
            $param_value = null;
            $param_fmin = null;
            $param_fmax = null;
            $param_srcfeature_uniq = null;

            // PDO and ErrorException are fully qualified because this file
            // lives in the cli_import namespace and does not import them.
            $statement_insert_repeat = $db->prepare('INSERT INTO feature (name, uniquename, type_id, organism_id, dbxref_id) VALUES (:name, :uniquename, :type_id, :organism_id, :dbxref_id)');
            $statement_insert_repeat->bindValue('type_id', CV_ANNOTATION_REPEATMASKER, \PDO::PARAM_INT);
            $statement_insert_repeat->bindValue('organism_id', DB_ORGANISM_ID, \PDO::PARAM_INT);
            $statement_insert_repeat->bindParam('name', $param_name, \PDO::PARAM_STR);
            $statement_insert_repeat->bindParam('uniquename', $param_uniquename, \PDO::PARAM_STR);
            $statement_insert_repeat->bindValue('dbxref_id', $import_prefix_id, \PDO::PARAM_INT);

            // The location row refers to the feature inserted just before it
            // (currval of the feature id sequence) and looks up the source
            // feature by its unique name.
            // NOTE(review): strand is always stored as 1, even when the line
            // carries the "C" (complement) marker - confirm this is intended.
            $statement_insert_featureloc = $db->prepare(sprintf('INSERT INTO featureloc (fmin, fmax, strand, feature_id, srcfeature_id) VALUES (:fmin, :fmax, :strand, currval(\'feature_feature_id_seq\'), (%s))', 'SELECT feature_id FROM feature WHERE uniquename=:srcfeature_uniquename AND organism_id=:organism LIMIT 1'));
            $statement_insert_featureloc->bindParam('fmin', $param_fmin, \PDO::PARAM_INT);
            $statement_insert_featureloc->bindParam('fmax', $param_fmax, \PDO::PARAM_INT);
            $statement_insert_featureloc->bindValue('strand', 1, \PDO::PARAM_INT);
            $statement_insert_featureloc->bindParam('srcfeature_uniquename', $param_srcfeature_uniq, \PDO::PARAM_STR);
            $statement_insert_featureloc->bindValue('organism', DB_ORGANISM_ID, \PDO::PARAM_INT);

            $statement_annotate_domain = $db->prepare('INSERT INTO featureprop (feature_id, type_id, value) VALUES (currval(\'feature_feature_id_seq\'), :cvterm, :value)');
            $statement_annotate_domain->bindParam('cvterm', $param_cvterm, \PDO::PARAM_INT);
            $statement_annotate_domain->bindParam('value', $param_value, \PDO::PARAM_STR);

            $file = fopen($filename, 'r');
            if ($file === false) {
                throw new \RuntimeException(sprintf('Unable to open file for reading: %s', $filename));
            }

            // Compare the raw fgets() result against false so that only a
            // real end-of-file ends the loop; a blank line or a line
            // consisting of "0" must not stop the import.
            while (($raw_line = fgets($file)) !== false) {
                $line = trim($raw_line);
                if ($line === '') {
                    continue;
                }

                $matches = null;
                if (preg_match($regex, $line, $matches) !== 1) {
                    // The line is passed as an argument, never as part of the
                    // format string, so "%" characters in it are harmless.
                    self::$log->log(sprintf("line does not match, skipping:\n\t%s", $line), PEAR_LOG_NOTICE);
                    continue;
                }

                // The optional family group is absent from $matches (or
                // empty) when the line has no "/family" part.
                $has_family = isset($matches['repeat_family']) && $matches['repeat_family'] !== '';
                $repeat_family = $has_family ? $matches['repeat_family'] : '';

                $param_name = sprintf("%s(%d-%d):%s#%s(%s)"
                        , $matches['name']
                        , $matches['start']
                        , $matches['end']
                        , $matches['repeat_name']
                        , $matches['repeat_class']
                        , $repeat_family
                );
                $param_uniquename = IMPORT_PREFIX . "_" . $param_name;
                $statement_insert_repeat->execute();

                $param_srcfeature_uniq = IMPORT_PREFIX . "_" . $matches['name'];
                $param_fmin = $matches['start'];
                $param_fmax = $matches['end'];
                $statement_insert_featureloc->execute();

                $param_cvterm = CV_REPEAT_NAME;
                $param_value = $matches['repeat_name'];
                $statement_annotate_domain->execute();

                $param_cvterm = CV_REPEAT_CLASS;
                $param_value = $matches['repeat_class'];
                $statement_annotate_domain->execute();

                if ($has_family) {
                    $param_cvterm = CV_REPEAT_FAMILY;
                    $param_value = $repeat_family;
                    $statement_annotate_domain->execute();
                    $families_added++;
                }

                self::updateProgress(++$lines_imported);
            }

            fclose($file);

            self::preCommitMsg();
            if (!$db->commit()) {
                $err = $db->errorInfo();
                throw new \ErrorException($err[2], ERRCODE_TRANSACTION_NOT_COMPLETED, 1);
            }
        } catch (\Exception $error) {
            // Release the input file if it is still open, then undo all
            // inserts of this run before passing the error on.
            if (is_resource($file)) {
                fclose($file);
            }
            $db->rollback();
            throw $error;
        }
        return array(LINES_IMPORTED => $lines_imported, 'families_added' => $families_added);
    }

    /**
     * Returns the one-line description of this command.
     *
     * @return string
     */
    public static function CLI_commandDescription() {
        return "Repeatmasker Output Importer";
    }

    /**
     * Returns the name of this command.
     *
     * @return string
     */
    public static function CLI_commandName() {
        return "annotation_repeatmasker";
    }

    /**
     * Returns the long help text: a notice, wrapped in ANSI red colour
     * escape sequences, that a Sequence ID import must have succeeded first.
     *
     * @return string
     */
    public static function CLI_longHelp() {
        return <<<EOF

\033[0;31mThis import requires a successful Sequence ID Import!\033[0m
EOF;
    }

}
184:
185: ?>
186: