Home | Trees | Indices | Help |
|
---|
|
# copyright 2006-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This file is part of logilab-mtconverter.
#
# logilab-mtconverter is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by the
# Free Software Foundation, either version 2.1 of the License, or (at your
# option) any later version.
#
# logilab-mtconverter is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
# for more details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with logilab-mtconverter. If not, see <http://www.gnu.org/licenses/>.
"""Transforms that delegate the conversion to an external command."""

# NOTE(review): this module was recovered from a flattened source listing in
# which every `def` / `class` header line had been dropped.  `bin_search`,
# `_command_line`, `pdf_to_text` and `lynx_dump` are named after the places
# where the listing itself references them; `POpenTransform`, `_convert` and
# the `**kwargs` catch-all are presumed names -- confirm against upstream.

import os
from tempfile import mkstemp
import subprocess

from logilab.mtconverter import MissingBinary
from logilab.mtconverter.transform import Transform

# Directories of $PATH that actually exist, computed once at import time.
# `.get` avoids a KeyError at import when PATH is not set at all.
bin_search_path = [path for path in os.environ.get('PATH', '').split(os.pathsep)
                   if os.path.isdir(path)]


def bin_search(binary):
    """search the bin_search_path for a given binary returning its fullname or
    raises MissingBinary

    :param binary: file name of the executable to look for
    :return: path of the first readable and executable match, following the
      order of `bin_search_path`
    :raise MissingBinary: if no directory of `bin_search_path` holds a match
    """
    mode = os.R_OK | os.X_OK
    for path in bin_search_path:
        pathbin = os.path.join(path, binary)
        # isfile() rejects a directory that happens to bear the binary's name:
        # directories usually satisfy R_OK | X_OK as well
        if os.path.isfile(pathbin) and os.access(pathbin, mode):
            return pathbin
    raise MissingBinary('Unable to find binary "%s" in %s' %
                        (binary, os.pathsep.join(bin_search_path)))


class POpenTransform(Transform):
    """abstract class for external command based transform

    The external command may read from stdin but must write to stdout
    If use_stdin is False, a temporary file will be used as input for
    the command
    """

    # name of the executable looked up with bin_search() when no explicit
    # `binary` is given to the constructor
    cmdname = None
    # arguments appended to the binary; when use_stdin is False the resulting
    # command line is %-formatted with an 'infile' key
    cmdargs = ""
    use_stdin = True
    # encoding handed to trdata.encode() to obtain the command's input
    input_encoding = None
    #output_encoding = 'utf-8'

    # NOTE(review): signature reconstructed from the names the body uses;
    # `**kwargs` is presumed and is not used here -- confirm.
    def __init__(self, name=None, binary=None, cmdargs=None, use_stdin=None,
                 **kwargs):
        """Override the class level settings with every non-None argument.

        :raise MissingBinary: if the binary cannot be found by bin_search()
        """
        if name is not None:
            self.name = name
        if binary is not None:
            self.binary = bin_search(binary)
        else:
            self.binary = bin_search(self.cmdname)
        if cmdargs is not None:
            self.cmdargs = cmdargs
        if use_stdin is not None:
            self.use_stdin = use_stdin

    def _command_line(self, trdata):
        """Return the command line to run: the binary followed by cmdargs."""
        return "%s %s" % (self.binary, self.cmdargs)

    # NOTE(review): method name presumed -- confirm.
    def _convert(self, trdata):
        """Run the external command on `trdata` and return its stripped output.

        The input is trdata.encode(self.input_encoding).  It is fed to the
        command's stdin when use_stdin is true, otherwise it is written to a
        temporary file whose name is substituted for %(infile)s in the command
        line.  stderr is merged into the returned output.
        """
        command = self._command_line(trdata)
        data = trdata.encode(self.input_encoding)
        tmpname = None
        try:
            if not self.use_stdin:
                tmpfile, tmpname = mkstemp(text=False)  # create tmp
                try:
                    # write data to tmp using a file descriptor
                    os.write(tmpfile, data)
                finally:
                    # close it so the other process can read it
                    os.close(tmpfile)
                # apply tmp name to command
                command = command % {'infile': tmpname}
            # the command line is a single string, hence shell=True
            process = subprocess.Popen(command, shell=True,
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       close_fds=True)
            # A Popen object cannot be unpacked into its pipes.  communicate()
            # writes the input (if any), closes stdin and reads stdout until
            # EOF without risking a deadlock on full pipe buffers.
            if self.use_stdin:
                out, _ = process.communicate(data)
            else:
                out, _ = process.communicate()
        finally:
            if tmpname is not None:
                # remove tmp file, even when the command could not be run
                os.unlink(tmpname)
        return out.strip()


class pdf_to_text(POpenTransform):
    """Turn a PDF document into UTF-8 plain text using `pdftotext`."""
    name = "pdf_to_text"
    inputs = ('application/pdf',)
    output = 'text/plain'
    output_encoding = 'utf-8'

    cmdname = "pdftotext"
    cmdargs = "%(infile)s -enc UTF-8 -"
    use_stdin = False


class lynx_dump(POpenTransform):
    """Turn an HTML document into plain text using `lynx -dump`."""
    name = "lynx_dump"
    inputs = ('text/html', 'text/xhtml')
    output = 'text/plain'

    cmdname = "lynx"
    cmdargs = "-dump -stdin"
    use_stdin = True

    def _command_line(self, trdata):
        """Return the lynx command line, passing trdata's encoding as both
        the assumed and the display charset.
        """
        encoding = trdata.encoding
        if encoding == 'ascii':
            encoding = 'iso-8859-1' # lynx doesn't know ascii !
        return '%s %s -assume_charset=%s -display_charset=%s' % (
            self.binary, self.cmdargs, encoding, encoding)


transform_classes = [pdf_to_text] # , lynx_dump]
Home | Trees | Indices | Help |
|
---|
Generated by Epydoc 3.0.1 on Sat Jun 23 22:15:08 2012 | http://epydoc.sourceforge.net |