Skip to content

Commit

Permalink
Merge pull request #165 from ekeydar/master
Browse files Browse the repository at this point in the history
add toFilename to handle translation of url to filename
  • Loading branch information
chrismattmann committed Nov 21, 2017
2 parents 5f107aa + bfd8606 commit 7c41923
Showing 1 changed file with 7 additions and 2 deletions.
9 changes: 7 additions & 2 deletions tika/tika.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@
"""

import sys, os, getopt, time, codecs
import sys, os, getopt, time, codecs, re
try:
unicode_string = unicode
binary_string = str
Expand Down Expand Up @@ -609,6 +609,11 @@ def startServer(tikaServerJar, serverHost = ServerHost, port = Port, classpath=N
cmd = Popen(cmd , stdout= logFile, stderr = STDOUT, shell =True)
time.sleep(5)

def toFilename(urlOrPath):
    '''
    Translates a URL or path into a string that is safe to use as a file name.
    Every character that is not a word character, whitespace or a hyphen is
    replaced by a hyphen; the result is stripped of surrounding whitespace and
    lower-cased; runs of hyphens and/or whitespace are then collapsed into a
    single hyphen and leading/trailing hyphens are removed.
    :param urlOrPath: the URL or path string to translate
    :return: the sanitized, lower-cased, hyphen-separated string
    '''
    # Raw strings keep the regex escapes (\w, \s) from being read as (invalid)
    # Python string escapes, which newer interpreters warn about.
    value = re.sub(r'[^\w\s-]', '-', urlOrPath).strip().lower()
    return re.sub(r'[-\s]+', '-', value).strip("-")


def getRemoteFile(urlOrPath, destPath):
'''
Fetches URL to local path or just returns absolute path.
Expand All @@ -622,7 +627,7 @@ def getRemoteFile(urlOrPath, destPath):
elif urlp.scheme not in ('http', 'https'):
return (urlOrPath, 'local')
else:
filename = urlOrPath.rsplit('/',1)[1]
filename = toFilename(urlOrPath)
destPath = destPath + '/' +filename
log.info('Retrieving %s to %s.' % (urlOrPath, destPath))
try:
Expand Down

0 comments on commit 7c41923

Please sign in to comment.