Tuesday, February 8, 2011

Convert PDF to Text Format

import popen2
import subprocess
from StringIO import StringIO
class InputStreamReader(object):
    """Thin wrapper around a file-like object that decodes what it reads.

    ``_read`` hands back whatever the wrapped stream returns, untouched;
    ``read`` additionally decodes byte strings with ``self.encoding``.
    """

    def __init__(self, inputStream, encoding=None):
        """Store the stream and the encoding used by ``read``.

        Args:
            inputStream: object exposing ``read(length)`` and ``close()``.
            encoding: codec name used to decode bytes; any falsy value
                (``None``, ``''``) falls back to ``'utf-8'``.
        """
        super(InputStreamReader, self).__init__()
        self.inputStream = inputStream
        self.encoding = encoding or 'utf-8'

    def _read(self, length):
        """Return the raw result of ``inputStream.read(length)``, undecoded."""
        return self.inputStream.read(length)

    def read(self, length=-1):
        """Read from the stream and return the result as text.

        Args:
            length: forwarded unchanged to the wrapped stream's ``read``
                (for standard file objects a negative value means
                "read until EOF").

        Returns:
            The decoded text. Data that the stream already delivers as
            text is returned as-is.

        Raises:
            UnicodeDecodeError: if the bytes read are not valid in
                ``self.encoding`` (this can happen when a fixed ``length``
                cuts a multi-byte character in half).
        """
        chunk = self._read(length)
        # Only byte strings need decoding. Checking for ``bytes`` instead of
        # calling the Python 2-only ``unicode()`` builtin keeps this working
        # on both Python 2 (where ``bytes is str``) and Python 3, and avoids
        # a TypeError when the stream is already in text mode.
        if isinstance(chunk, bytes):
            chunk = chunk.decode(self.encoding)
        return chunk

    def close(self):
        """Close the wrapped stream."""
        self.inputStream.close()

# Run pdftotext and capture what it writes. 'Full_Path' is a placeholder for
# the path of the PDF to convert; the trailing "-" is passed as the output
# argument so the result arrives on the pipe we read below.
# subprocess replaces the deprecated popen2 module; stderr is merged into
# stdout to keep the combined-output behaviour of popen2.Popen4.
process = subprocess.Popen(
    ["pdftotext", "-enc", "UTF-8", 'Full_Path', "-"],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
)
reader = InputStreamReader(process.stdout, 'utf-8')
try:
    # _read() returns the output exactly as the pipe delivers it (no decoding).
    data = reader._read(-1)
finally:
    # Release the pipe and reap the child so no file descriptor or zombie
    # process is left behind, even if reading fails.
    reader.close()
    process.wait()
print(data)

No comments:

Post a Comment