1
2
3
4
5
6
7
8
9
10
11
12
13 """Parse Unigene flat file format files such as the Hs.data file.
14
15 Here is an overview of the flat file format that this parser deals with:
16 Line types/qualifiers:
17
18 ID UniGene cluster ID
19 TITLE Title for the cluster
20 GENE Gene symbol
21 CYTOBAND Cytological band
22 EXPRESS Tissues of origin for ESTs in cluster
23 RESTR_EXPR Single tissue or development stage contributes
24 more than half the total EST frequency for this gene.
25 GNM_TERMINUS genomic confirmation of presence of a 3' terminus;
26 T if a non-templated polyA tail is found among
27 a cluster's sequences; else
28 I if templated As are found in genomic sequence or
29 S if a canonical polyA signal is found on
30 the genomic sequence
31 GENE_ID Entrez gene identifier associated with at least one
32 sequence in this cluster;
33 to be used instead of LocusLink.
34 LOCUSLINK LocusLink identifier associated with at least one
35 sequence in this cluster;
36 deprecated in favor of GENE_ID
37 HOMOL Homology;
38 CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping
39 on the Arabidopsis genome.
40 STS STS
41 ACC= GenBank/EMBL/DDBJ accession number of STS
42 [optional field]
43 UNISTS= identifier in NCBI's UNISTS database
44 TXMAP Transcript map interval
45 MARKER= Marker found on at least one sequence in this
46 cluster
47 RHPANEL= Radiation Hybrid panel used to place marker
48 PROTSIM Protein Similarity data for the sequence with
49 highest-scoring protein similarity in this cluster
50 ORG= Organism
51 PROTGI= Sequence GI of protein
52 PROTID= Sequence ID of protein
53 PCT= Percent alignment
54 ALN= length of aligned region (aa)
55 SCOUNT Number of sequences in the cluster
56 SEQUENCE Sequence
57 ACC= GenBank/EMBL/DDBJ accession number of sequence
58 NID= Unique nucleotide sequence identifier (gi)
59 PID= Unique protein sequence identifier (used for
60 non-ESTs)
61 CLONE= Clone identifier (used for ESTs only)
62 END= End (5'/3') of clone insert read (used for
63 ESTs only)
64 LID= Library ID; see Hs.lib.info for library name
65 and tissue
66 MGC= 5' CDS-completeness indicator; if present, the
67 clone associated with this sequence is believed
68 CDS-complete. A value greater than 511 is the gi
69 of the CDS-complete mRNA matched by the EST,
70 otherwise the value is an indicator of the
71 reliability of the test indicating CDS
72 completeness; higher values indicate more
73 reliable CDS-completeness predictions.
74 SEQTYPE= Description of the nucleotide sequence.
75 Possible values are mRNA, EST and HTC.
76 TRACE= The Trace ID of the EST sequence, as provided by
77 NCBI Trace Archive
78 """
79
80
82 """Store the information for one SEQUENCE line from a Unigene file
83
84 Initialize with the text part of the SEQUENCE line, or nothing.
85
86 Attributes and descriptions (access as LOWER CASE)
87 ACC= GenBank/EMBL/DDBJ accession number of sequence
88 NID= Unique nucleotide sequence identifier (gi)
89 PID= Unique protein sequence identifier (used for non-ESTs)
90 CLONE= Clone identifier (used for ESTs only)
91 END= End (5'/3') of clone insert read (used for ESTs only)
92 LID= Library ID; see Hs.lib.info for library name and tissue
93 MGC= 5' CDS-completeness indicator; if present,
94 the clone associated with this sequence
95 is believed CDS-complete. A value greater than 511
96 is the gi of the CDS-complete mRNA matched by the EST,
97 otherwise the value is an indicator of the reliability
98 of the test indicating CDS completeness;
99 higher values indicate more reliable CDS-completeness
100 predictions.
101 SEQTYPE= Description of the nucleotide sequence. Possible values
102 are mRNA, EST and HTC.
103 TRACE= The Trace ID of the EST sequence, as provided by NCBI
104 Trace Archive
105 """
106
108 self.acc = ''
109 self.nid = ''
110 self.lid = ''
111 self.pid = ''
112 self.clone = ''
113 self.image = ''
114 self.is_image = False
115 self.end = ''
116 self.mgc = ''
117 self.seqtype = ''
118 self.trace = ''
119 if not text==None:
120 self.text=text
121 self._init_from_text(text)
122
123 - def _init_from_text(self,text):
124 parts = text.split('; ');
125 for part in parts:
126 key, val = part.split("=")
127 if key=='CLONE':
128 if val[:5]=='IMAGE':
129 self.is_image=True
130 self.image = val[6:]
131 setattr(self,key.lower(),val)
132
135
136
138 """Store the information for one PROTSIM line from a Unigene file
139
140 Initialize with the text part of the PROTSIM line, or nothing.
141
142 Attributes and descriptions (access as LOWER CASE)
143 ORG= Organism
144 PROTGI= Sequence GI of protein
145 PROTID= Sequence ID of protein
146 PCT= Percent alignment
147 ALN= length of aligned region (aa)
148 """
149
151 self.org = ''
152 self.protgi = ''
153 self.protid = ''
154 self.pct = ''
155 self.aln = ''
156 if not text==None:
157 self.text=text
158 self._init_from_text(text)
159
160 - def _init_from_text(self,text):
161 parts = text.split('; ');
162
163 for part in parts:
164 key, val = part.split("=")
165 setattr(self,key.lower(),val)
166
169
170
172 """Store the information for one STS line from a Unigene file
173
174 Initialize with the text part of the STS line, or nothing.
175
176 Attributes and descriptions (access as LOWER CASE)
177
178 ACC= GenBank/EMBL/DDBJ accession number of STS [optional field]
179 UNISTS= identifier in NCBI's UNISTS database
180 """
181
188
189 - def _init_from_text(self,text):
190 parts = text.split(' ');
191
192 for part in parts:
193 key, val = part.split("=")
194 setattr(self,key.lower(),val)
195
198
199
201 """Store a Unigene record
202
203 Here is what is stored:
204
205 self.ID = '' # ID line
206 self.species = '' # Hs, Bt, etc.
207 self.title = '' # TITLE line
208 self.symbol = '' # GENE line
209 self.cytoband = '' # CYTOBAND line
210 self.express = [] # EXPRESS line, parsed on ';'
211 # Will be an array of strings
212 self.restr_expr = '' # RESTR_EXPR line
213 self.gnm_terminus = '' # GNM_TERMINUS line
214 self.gene_id = '' # GENE_ID line
215 self.locuslink = '' # LOCUSLINK line
216 self.homol = '' # HOMOL line
217 self.chromosome = '' # CHROMOSOME line
218 self.protsim = [] # PROTSIM entries, array of Protsims
219 # Type ProtsimLine
220 self.sequence = [] # SEQUENCE entries, array of Sequence entries
221 # Type SequenceLine
222 self.sts = [] # STS entries, array of STS entries
223 # Type STSLine
224 self.txmap = [] # TXMAP entries, array of TXMap entries
225 """
226
228 self.ID = ''
229 self.species = ''
230 self.title = ''
231 self.symbol = ''
232 self.cytoband = ''
233 self.express = []
234 self.restr_expr = ''
235 self.gnm_terminus = ''
236 self.gene_id = ''
237 self.locuslink = ''
238 self.homol = ''
239 self.chromosome = ''
240 self.protsim = []
241 self.sequence = []
242 self.sts = []
243 self.txmap = []
244
246 return "<%s> %s %s\n%s" % (self.__class__.__name__,
247 self.ID, self.symbol, self.title)
248
255
256
266
267
268
269
270
323
324
325
326
327
328 from Bio.ParserSupport import *
329 import re
330 import Bio
331
332
333
334
335 UG_INDENT=12
336
338 """Store the information for one SEQUENCE line from a Unigene file
339 (DEPRECATED).
340
341 Initialize with the text part of the SEQUENCE line, or nothing.
342
343 Attributes and descriptions (access as LOWER CASE)
344 ACC= GenBank/EMBL/DDBJ accession number of sequence
345 NID= Unique nucleotide sequence identifier (gi)
346 PID= Unique protein sequence identifier (used for non-ESTs)
347 CLONE= Clone identifier (used for ESTs only)
348 END= End (5'/3') of clone insert read (used for ESTs only)
349 LID= Library ID; see Hs.lib.info for library name and tissue
350 MGC= 5' CDS-completeness indicator; if present,
351 the clone associated with this sequence
352 is believed CDS-complete. A value greater than 511
353 is the gi of the CDS-complete mRNA matched by the EST,
354 otherwise the value is an indicator of the reliability
355 of the test indicating CDS completeness;
356 higher values indicate more reliable CDS-completeness predictions.
357 SEQTYPE= Description of the nucleotide sequence. Possible values are
358 mRNA, EST and HTC.
359 TRACE= The Trace ID of the EST sequence, as provided by NCBI Trace Archive
360 PERIPHERAL= Indicator that the sequence is a suboptimal
361 representative of the gene represented by this cluster.
362 Peripheral sequences are those that are in a cluster
363 which represents a spliced gene without sharing a
364 splice junction with any other sequence. In many
365 cases, they are unspliced transcripts originating
366 from the gene.
367
368 This class is DEPRECATED; please use the read() function in this module
369 instead.
370 """
371
373 import warnings
374 warnings.warn("Bio.UniGene.UnigeneSequenceRecord is deprecated; please use the read() function in this module instead", Bio.BiopythonDeprecationWarning)
375 self.acc = ''
376 self.nid = ''
377 self.lid = ''
378 self.pid = ''
379 self.clone = ''
380 self.image = ''
381 self.is_image = False
382 self.end = ''
383 self.mgc = ''
384 self.seqtype = ''
385 self.Trace = ''
386 self.peripheral = ''
387 if not text==None:
388 self.text=text
389 return self._init_from_text(text)
390
391 - def _init_from_text(self,text):
392 parts = text.split('; ');
393 for part in parts:
394 key,val = re.match('(\w+)=(\S+)',part).groups()
395 if key=='CLONE':
396 if val[:5]=='IMAGE':
397 self.is_image=True
398 self.image = val[6:]
399 setattr(self,key.lower(),val)
400
403
404
406 """Store the information for one PROTSIM line from a Unigene file
407 (DEPRECATED).
408
409 Initialize with the text part of the PROTSIM line, or nothing.
410
411 Attributes and descriptions (access as LOWER CASE)
412 ORG= Organism
413 PROTGI= Sequence GI of protein
414 PROTID= Sequence ID of protein
415 PCT= Percent alignment
416 ALN= length of aligned region (aa)
417
418 This class is DEPRECATED; please use the read() function in this module
419 instead.
420 """
421
423 import warnings
424 warnings.warn("Bio.UniGene.UnigeneProtsimRecord is deprecated; please use the read() function in this module instead", Bio.BiopythonDeprecationWarning)
425 self.org = ''
426 self.protgi = ''
427 self.protid = ''
428 self.pct = ''
429 self.aln = ''
430 if not text==None:
431 self.text=text
432 return self._init_from_text(text)
433
434 - def _init_from_text(self,text):
435 parts = text.split('; ');
436
437 for part in parts:
438 key,val = re.match('(\w+)=(\S+)',part).groups()
439 setattr(self,key.lower(),val)
440
443
444
446 """Store the information for one STS line from a Unigene file
447 (DEPRECATED).
448
449 Initialize with the text part of the STS line, or nothing.
450
451 Attributes and descriptions (access as LOWER CASE)
452
453 NAME= Name of STS
454 ACC= GenBank/EMBL/DDBJ accession number of STS [optional field]
455 DSEG= GDB Dsegment number [optional field]
456 UNISTS= identifier in NCBI's UNISTS database
457
458 This class is DEPRECATED; please use the read() function in this module
459 instead.
460 """
461
472
473 - def _init_from_text(self,text):
474 parts = text.split(' ');
475
476 for part in parts:
477 key,val = re.match('(\w+)=(\S+)',part).groups()
478 setattr(self,key.lower(),val)
479
482
483
485 """Store a Unigene record (DEPRECATED).
486
487 Here is what is stored:
488
489 self.ID = '' # ID line
490 self.species = '' # Hs, Bt, etc.
491 self.title = '' # TITLE line
492 self.symbol = '' # GENE line
493 self.cytoband = '' # CYTOBAND line
494 self.express = [] # EXPRESS line, parsed on ';'
495 # Will be an array of strings
496 self.restr_expr = '' # RESTR_EXPR line
497 self.gnm_terminus = '' # GNM_TERMINUS line
498 self.gene_id = '' # GENE_ID line
499 self.chromosome = '' # CHROMOSOME
500 self.protsim = [] # PROTSIM entries, array of Protsims
501 # Type UnigeneProtsimRecord
502 self.sequence = [] # SEQUENCE entries, array of Sequence entries
503 # Type UnigeneSequenceRecord
504 self.sts = [] # STS entries, array of STS entries
505 # Type UnigeneSTSRecord
506 self.txmap = [] # TXMAP entries, array of TXMap entries
507
508 This class is DEPRECATED; please use the read() function in this module
509 instead.
510 """
511
513 import warnings
514 warnings.warn("Bio.UniGene.UnigeneRecord is deprecated; please use the read() function in this module instead", Bio.BiopythonDeprecationWarning)
515 self.ID = ''
516 self.species = ''
517 self.title = ''
518 self.symbol = ''
519 self.cytoband = ''
520 self.express = []
521 self.restr_expr = ''
522 self.gnm_terminus = ''
523 self.gene_id = ''
524 self.chromosome = ''
525 self.protsim = []
526 self.sequence = []
527 self.sts = []
528 self.txmap = []
529
531 return "<%s> %s %s\n%s" % (self.__class__.__name__,
532 self.ID, self.symbol, self.title)
533
534
536 """This class is DEPRECATED; please use the read() function in this module
537 instead."""
538
548 - def GENE(self,line):
566 - def STS(self,line):
569
570
def _get_single_entry(self, line):
    """Return the value of a single-value line.

    The first UG_INDENT characters of ``line`` are dropped and the
    remainder is returned unchanged.
    """
    value = line[UG_INDENT:]
    return value
575
def _get_array_entry(self, line, split_on):
    """Return the values of a multi-value line as a list.

    The first UG_INDENT characters of ``line`` are dropped and the
    remainder is split on the ``split_on`` separator.
    """
    payload = line[UG_INDENT:]
    return payload.split(split_on)
580
581
583 """Scans a Unigene Flat File Format file (DEPRECATED).
584
585 This class is DEPRECATED; please use the read() function in this module
586 instead.
587 """
588
590 import warnings
591 warnings.warn("Bio.UniGene._Scanner is deprecated; please use the read() function in this module instead", Bio.BiopythonDeprecationWarning)
592
def feed(self, handle, consumer):
    """Feed events from parsing a Unigene file to a consumer.

    handle is a file-like object (it is only iterated line by line), and
    consumer is a consumer object that will receive events as the file is
    scanned.

    consumer.start_record() is called first. Each line is then stripped of
    trailing whitespace; the first space-delimited word of the line is
    its tag, and the consumer attribute of that name, if callable, is
    called with the stripped line. A line equal to '//' calls
    consumer.end_record() and stops the scan. If the consumer has no
    attribute named after the tag, a 'no method called ...' message is
    written to standard output and scanning continues.
    """
    import sys

    consumer.start_record()
    for line in handle:
        line = line.rstrip()
        if line == '//':
            consumer.end_record()
            break
        # Take the tag from the stripped line so that a line holding only
        # a tag (no value after it) does not carry its newline into the
        # attribute lookup.
        tag = line.split(' ')[0]
        try:
            handler = getattr(consumer, tag)
        except AttributeError:
            # Plain write instead of a print statement: emits the same
            # text and is valid syntax on both Python 2 and Python 3.
            sys.stdout.write('no method called %s\n' % tag)
        else:
            if callable(handler):
                handler(line)
615
616
618 """This class is DEPRECATED; please use the read() function in this module
619 instead."""
620
626
627 - def parse(self, handle):
634
636 """This class is DEPRECATED; please use the parse() function in this module
637 instead."""
638
639 - def __init__(self, handle, parser=None):
643
645 self._parser = RecordParser()
646 lines = []
647 while True:
648 line = self._uhandle.readline()
649 if not line: break
650 if line[:2] == '//':
651 break
652 lines.append(line)
653 if not lines:
654 return None
655 lines.append('//')
656 data = ''.join(lines)
657 if self._parser is not None:
658 return self._parser.parse(File.StringHandle(data))
659 return data
660
662 return iter(self.next, None)
663