| 1 | #!/usr/bin/env python |
|---|
| 2 | |
|---|
| 3 | import getopt |
|---|
| 4 | import shutil |
|---|
| 5 | import sys |
|---|
| 6 | import uno |
|---|
| 7 | from os import getcwd |
|---|
| 8 | from os.path import splitext |
|---|
| 9 | |
|---|
| 10 | from unohelper import Base, systemPathToFileUrl, absolutize |
|---|
| 11 | from com.sun.star.beans import PropertyValue |
|---|
| 12 | from com.sun.star.uno import Exception as UnoException |
|---|
| 13 | from com.sun.star.io import IOException, XOutputStream |
|---|
| 14 | |
|---|
| 15 | |
|---|
class OutputStream(Base, XOutputStream):
    """UNO output stream implementation that forwards written data to stdout.

    Provides the closeOutput/writeBytes/flush methods of XOutputStream so an
    instance can be passed to the office as an "OutputStream" property value.
    """

    def __init__(self):
        # Flag set to 1 by closeOutput(); nothing in this class reads it back.
        self.closed = 0

    def closeOutput(self):
        """Mark the stream as closed; stdout itself is left open."""
        self.closed = 1

    def writeBytes(self, seq):
        """Write the payload of *seq* to stdout.

        *seq* must expose the raw data through a ``value`` attribute
        (presumably a uno.ByteSequence -- verify against the PyUNO bridge).
        """
        # On Python 3 the text-mode sys.stdout rejects bytes, so write to its
        # underlying binary buffer; Python 2 has no ``buffer`` attribute and
        # accepts the data directly.
        target = getattr(sys.stdout, "buffer", sys.stdout)
        target.write(seq.value)

    def flush(self):
        """Do nothing; no buffering is done by this class itself."""
        pass
|---|
| 29 | |
|---|
| 30 | |
|---|
def main():
    """Convert every document named on the command line and exit.

    Parses the options, connects to the office instance addressed by the
    connection string, then loads each input file hidden and stores it next
    to the source under the selected extension. With --stdout the stored
    file is additionally copied to stdout.

    Always terminates through sys.exit(): status 0 on success (and after
    printing usage for --help or when no files are given), status 1 when an
    option is invalid, the connection fails or any document fails to convert.
    """
    retVal = 0

    try:
        url, filterName, extension, stdout, args = _parse_options(sys.argv[1:])

        if not args:
            usage()
            sys.exit()

        # Bootstrap a local context only to obtain the URL resolver, then
        # switch to the context of the remote (running) office instance.
        ctxLocal = uno.getComponentContext()
        smgrLocal = ctxLocal.ServiceManager

        resolver = smgrLocal.createInstanceWithContext(
            "com.sun.star.bridge.UnoUrlResolver", ctxLocal)
        ctx = resolver.resolve(url)
        smgr = ctx.ServiceManager

        desktop = smgr.createInstanceWithContext("com.sun.star.frame.Desktop",
                                                 ctx)

        cwd = systemPathToFileUrl(getcwd())
        outProps = (
            PropertyValue("FilterName", 0, filterName, 0),
            PropertyValue("Overwrite", 0, True, 0),
            PropertyValue("OutputStream", 0, OutputStream(), 0))

        # One-element tuple: the load call expects a sequence of properties.
        inProps = (PropertyValue("Hidden", 0, True, 0),)

        for path in args:
            # Keep going after a failure so the remaining files still convert.
            if not _convert_document(desktop, cwd, path, extension,
                                     inProps, outProps, stdout):
                retVal = 1

    except UnoException as e:
        sys.stderr.write("Error (" + repr(e.__class__) + ") :" + e.Message + "\n")
        retVal = 1
    except getopt.GetoptError as e:
        sys.stderr.write(str(e) + "\n")
        usage()
        retVal = 1

    sys.exit(retVal)


def _parse_options(argv):
    """Translate the command-line arguments into conversion settings.

    Returns a tuple (url, filterName, extension, stdout, args) where *args*
    are the remaining positional arguments (the input files). Prints the
    usage text and exits for -h/--help. Raises getopt.GetoptError for
    unknown options or missing option arguments.
    """
    opts, args = getopt.getopt(
        argv, "hc:",
        ["help", "connection-string=", "html", "pdf", "stdout", "format="])

    url = "uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext"
    filterName = "Text (Encoded)"
    extension = "txt"
    stdout = False

    # NOTE(review): --format only switches the filter; the output extension
    # stays whatever it was (default "txt"). Kept as in the original.
    formatFilters = {
        "xls": "HTML (StarCalc)",
        "doc": "HTML (StarWriter)",
        "ppt": "impress_html_Export",
    }

    # Options are applied in order, so a later option overrides an earlier one.
    for o, a in opts:
        if o in ("-h", "--help"):
            usage()
            sys.exit()
        elif o in ("-c", "--connection-string"):
            url = "uno:" + a + ";urp;StarOffice.ComponentContext"
        elif o == "--html":
            filterName = "HTML (StarWriter)"
            extension = "html"
        elif o == "--pdf":
            filterName = "writer_pdf_Export"
            extension = "pdf"
        elif o == "--format":
            # Unknown format values are ignored and leave the filter as is.
            filterName = formatFilters.get(a, filterName)
        elif o == "--stdout":
            stdout = True

    return url, filterName, extension, stdout, args


def _convert_document(desktop, cwd, path, extension, inProps, outProps, stdout):
    """Load *path*, store it under the new extension; return True on success.

    UNO errors are reported on stderr and turned into a False return value.
    The loaded document is disposed in every case.
    """
    # Start from None for each file so a failed load never disposes a
    # document that belonged to a previous file.
    doc = None
    try:
        fileUrl = absolutize(cwd, systemPathToFileUrl(path))
        doc = desktop.loadComponentFromURL(fileUrl, "_blank", 0, inProps)

        if not doc:
            raise UnoException("Couldn't open stream for unknown reason", None)

        dest = splitext(path)[0] + "." + extension
        destUrl = absolutize(cwd, systemPathToFileUrl(dest))
        sys.stderr.write("output url %s \n" % destUrl)
        doc.storeToURL(destUrl, outProps)

        if stdout:
            # Convert the URL back properly instead of slicing off "file://",
            # which would keep URL escapes (e.g. %20) in the file name.
            fn = uno.fileUrlToSystemPath(destUrl)
            sys.stderr.write("output file %s \n" % fn)
            # Binary copy: use the byte-level stdout where one exists.
            target = getattr(sys.stdout, "buffer", sys.stdout)
            with open(fn, 'rb') as f:
                shutil.copyfileobj(f, target)
        return True
    except IOException as e:
        sys.stderr.write("Error during conversion: " + e.Message + "\n")
    except UnoException as e:
        sys.stderr.write("Error ("+repr(e.__class__)+") during conversion:" + e.Message + "\n")
    finally:
        if doc:
            doc.dispose()
    return False
|---|
| 123 | |
|---|
| 124 | |
|---|
def usage():
    """Write the command-line help text to stderr."""
    sys.stderr.write(
        "usage: ooextract.py --help |\n"
        "       [-c <connection-string> | --connection-string=<connection-string>]\n"
        "       [--html | --pdf | --format=<xls|doc|ppt>]\n"
        "       [--stdout]\n"
        "       file1 file2 ...\n"
        "\n"
        "Extracts plain text from documents and writes it to a file next to each\n"
        "source document (same name, new extension). With --stdout the converted\n"
        "output is also copied to stdout.\n"
        "Requires an OpenOffice.org instance to be running. The script and the\n"
        "running OpenOffice.org instance must be able to access the file by the\n"
        "same system path. To have a listening OpenOffice.org instance, just run:\n"
        "openoffice \"-accept=socket,host=localhost,port=2002;urp;\"\n"
        "\n"
        "-h | --help\n"
        "        Print this help text and exit\n"
        "--stdout\n"
        "        Copy the converted output to stdout after it has been written\n"
        "-c <connection-string> | --connection-string=<connection-string>\n"
        "        The connection-string part of a uno url to where the\n"
        "        script should connect to in order to do the conversion.\n"
        "        The string defaults to socket,host=localhost,port=2002\n"
        "--html\n"
        "        Instead of the text filter, the writer html filter is used\n"
        "--pdf\n"
        "        Instead of the text filter, the pdf filter is used\n"
        "--format=<xls|doc|ppt>\n"
        "        Use the html export filter for spreadsheets (xls), text\n"
        "        documents (doc) or presentations (ppt); other values are\n"
        "        ignored. The output file extension is not changed by this option\n"
    )
|---|
| 148 | |
|---|
# Run the conversion only when executed as a script, so that importing this
# module neither starts a conversion nor ends the interpreter through the
# sys.exit() call at the end of main().
if __name__ == "__main__":
    main()
|---|