Package python-module-logilab-mtconverter-0 :: Package 8 :: Package 2 :: Package transforms :: Module cmdtransforms
[frames | no frames]

Source Code for Module python-module-logilab-mtconverter-0.8.2.transforms.cmdtransforms

  1  # copyright 2006-2011 LOGILAB S.A. (Paris, FRANCE), all rights reserved. 
  2  # contact http://www.logilab.fr/ -- mailto:contact@logilab.fr 
  3  # 
  4  # This file is part of logilab-mtconverter. 
  5  # 
  6  # logilab-mtconverter is free software: you can redistribute it and/or modify it 
  7  # under the terms of the GNU Lesser General Public License as published by the 
  8  # Free Software Foundation, either version 2.1 of the License, or (at your 
  9  # option) any later version. 
 10  # 
 11  # logilab-mtconverter is distributed in the hope that it will be useful, but 
 12  # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 
 13  # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License 
 14  # for more details. 
 15  # 
 16  # You should have received a copy of the GNU Lesser General Public License along 
 17  # with logilab-mtconverter. If not, see <http://www.gnu.org/licenses/>. 
 18  import os 
 19  from tempfile import mkstemp 
 20  import subprocess 
 21   
 22  from logilab.mtconverter import MissingBinary 
 23  from logilab.mtconverter.transform import Transform 
 24   
 25  bin_search_path = [path for path in os.environ['PATH'].split(os.pathsep) 
 26                     if os.path.isdir(path)] 
 27   
 28   
def bin_search(binary):
    """search the bin_search_path for a given binary and return its full name

    The directories of the module-level ``bin_search_path`` list are tried in
    order; the first candidate that is a regular file readable and executable
    by the current process is returned.

    :param binary: file name of the executable to look for
    :return: path of the first matching file (directory joined with `binary`)
    :raise MissingBinary: if no directory of ``bin_search_path`` contains a
      matching file
    """
    mode = os.R_OK | os.X_OK
    for path in bin_search_path:
        pathbin = os.path.join(path, binary)
        # os.access() alone also succeeds on directories (X_OK means
        # "searchable" for them), so require a regular file as well
        if os.path.isfile(pathbin) and os.access(pathbin, mode):
            return pathbin
    raise MissingBinary('Unable to find binary "%s" in %s' %
                        (binary, os.pathsep.join(bin_search_path)))
41 42
class POpenTransform(Transform):
    """abstract class for external command based transform

    The external command may read from stdin but must write to stdout.
    If use_stdin is False, a temporary file will be used as input for
    the command: its name replaces the ``%(infile)s`` placeholder of the
    command line.

    Class attributes, each overridable per instance through __init__:

    :cmdname: name of the binary looked up with bin_search()
    :cmdargs: arguments appended after the binary on the command line
    :use_stdin: whether the data is fed to the command through its stdin
    :input_encoding: value given to ``trdata.encode()`` to obtain the data
    """

    cmdname = None
    cmdargs = ""
    use_stdin = True
    input_encoding = None
    #output_encoding = 'utf-8'

    def __init__(self, name=None, binary=None, cmdargs=None, use_stdin=None,
                 **kwargs):
        """locate the external binary and apply per-instance overrides

        :param name: transform name; the class-level one is kept when None
        :param binary: name of the binary to use instead of `cmdname`
        :param cmdargs: command arguments overriding the class-level ones
        :param use_stdin: overrides the class-level `use_stdin` flag
        :param kwargs: accepted for signature compatibility; not used here
        :raise MissingBinary: if the binary can't be found by bin_search()
        """
        if name is not None:
            self.name = name
        if binary is not None:
            self.binary = bin_search(binary)
        else:
            self.binary = bin_search(self.cmdname)
        if cmdargs is not None:
            self.cmdargs = cmdargs
        if use_stdin is not None:
            self.use_stdin = use_stdin

    def _command_line(self, trdata):
        """return the shell command line to run for `trdata`"""
        return "%s %s" % (self.binary, self.cmdargs)

    def _convert(self, trdata):
        """run the external command on `trdata` and return its output

        The data is obtained from ``trdata.encode(self.input_encoding)`` and
        handed to the command either on its stdin or through a temporary
        file, depending on `use_stdin`.

        :return: what the command wrote on stdout and stderr (both are merged
          into a single stream), stripped of surrounding whitespace
        """
        command = self._command_line(trdata)
        data = trdata.encode(self.input_encoding)
        tmpname = None
        try:
            if not self.use_stdin:
                tmpfile, tmpname = mkstemp(text=False)
                stream = os.fdopen(tmpfile, 'wb')
                try:
                    stream.write(data)
                finally:
                    # closed before the command starts so that the other
                    # process can read the file
                    stream.close()
                command = command % {'infile': tmpname}
            process = subprocess.Popen(command, shell=True,
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       close_fds=True)
            # communicate() feeds stdin and drains the output concurrently
            # (no deadlock when both are large), closes stdin even when no
            # input is given and waits for the command to terminate
            if self.use_stdin:
                out, _ = process.communicate(data)
            else:
                out, _ = process.communicate()
        finally:
            # remove the temporary file even when the command failed to run
            if tmpname is not None:
                os.unlink(tmpname)
        return out.strip()
93 94
class pdf_to_text(POpenTransform):
    """transform PDF data into UTF-8 plain text using the pdftotext command

    pdftotext is given a file name rather than stdin, hence
    ``use_stdin = False``.
    """
    name = "pdf_to_text"
    inputs = ('application/pdf',)
    output = 'text/plain'
    output_encoding = 'utf-8'

    cmdname = "pdftotext"
    # the input file name is quoted because the command line goes through a
    # shell: an unquoted temporary path containing spaces would be split into
    # several arguments
    cmdargs = '"%(infile)s" -enc UTF-8 -'
    use_stdin = False
104 105
class lynx_dump(POpenTransform):
    """transform (X)HTML into plain text by running ``lynx -dump -stdin``"""
    name = "lynx_dump"
    inputs = ('text/html', 'text/xhtml')
    output = 'text/plain'

    cmdname = "lynx"
    cmdargs = "-dump -stdin"
    use_stdin = True

    def _command_line(self, trdata):
        """return the lynx command line, with both the assumed and the
        display charset set from the encoding of `trdata`
        """
        charset = trdata.encoding
        if charset == 'ascii':
            # lynx doesn't know ascii, iso-8859-1 is used in its place
            charset = 'iso-8859-1'
        template = '%s %s -assume_charset=%s -display_charset=%s'
        arguments = (self.binary, self.cmdargs, charset, charset)
        return template % arguments
# Transform classes listed by this module.  Only pdf_to_text is included:
# lynx_dump is defined in this file but was commented out of the list
# (the reason is not recorded here).
transform_classes = [pdf_to_text]