ecatkins · donknight · Sep 14, 2017 · Sep 14, 2017 · Sep 14, 2017 · Sep 14, 2017
diff --git a/xpdf_python/check_xpdf.py b/xpdf_python/check_xpdf.py
@@ -1,7 +1,6 @@
 import os
 import sys
 
-if os.path.isfile('/usr/local/bin/pdftotext'):
-	pass
-else:
-	sys.exit("Did not detect correctly installed xpdf. Please follow install instructions at: https://github.com/ecatkins/xpdf_python.")
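+# Heuristic: looks for 'pdftotext' or 'XPDF' as a substring of PATH, which is meant to cover Windows, Linux, or macOS installs of either the pdftotext distro or the XPDF tools distro (only when the install directory name itself contains one of those strings)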
+if not any(i in os.environ['PATH'] for i in ('pdftotext', 'XPDF')):
+	sys.exit("Did not detect correctly installed xpdf. Please follow install instructions at: https://github.com/ecatkins/xpdf_python.")
diff --git a/xpdf_python/wrapper.py b/xpdf_python/wrapper.py
@@ -13,13 +13,14 @@ def countPages(filename):
 	data = open(filename,"r", encoding = "ISO-8859-1").read()
 	return len(rxcountpages.findall(data))
 
-def to_text(file_loc, page_nums = True):
+def to_text(file_loc, page_nums=True, options=()):
 	''' Converts PDF to text
 
 	Args
 	- - - - - - -
 		file_loc: path to pdf document, string
 		page_nums: whether to insert page numbers into document, boolean
+		options: allows the addition of any of the normal options accepted by pdftotext, tuple of strings
 
 	Returns
 	- - - - - - -
@@ -35,35 +36,45 @@ def to_text(file_loc, page_nums = True):
 		cd = os.getcwd()
 		full_file_loc = os.path.join(cd, file_loc)
 
+	path, file = os.path.split(full_file_loc)
+	saved_file = os.path.join(path, os.path.splitext(file)[0] + '.txt')
+
 	text = ''
 	actual_count = 0
 
 	# If page numbers are to be inserted
 	if page_nums:
-		# Count number of pages
 		num = countPages(full_file_loc)
 		# Accounts for errors occuring in countPages function
 		if num == 0:
 			num = 100
-		for i in range(num):
-			actual = i + 1
-			# Calls xpdf 
-			subprocess.call(['pdftotext', '-f', str(actual),'-l', str(actual), full_file_loc])
-			# Opens file saved to disk 
-			saved_file = full_file_loc.replace('.pdf','.txt')
-			file = open(saved_file,'r', encoding = "ISO-8859-1")
+	else:
+		# No page numbers requested: run the loop exactly once so the whole document is converted in a single bulk pdftotext call
+		num = 1
+		actual_count = countPages(full_file_loc)  # try and provide a page estimate on bulk runs
+
+	for i in range(num):
+		actual = i + 1
+		opt = options
+		if page_nums:
+			opt += ('-f', str(actual), '-l', str(actual))
+
+		# Calls xpdf
+		subprocess.call(['pdftotext', *opt, full_file_loc])
+
+		# Opens file saved to disk, ensures it will always close when done
+		with open(saved_file, 'r', encoding='ISO-8859-1') as file:
 			t = file.read()
-			# If the page is blank, it is not a real page
-			if t == '':
-				continue
-			else:
-				actual_count += 1
-			# Add text and page count to existing string
+		# If the page is blank, it is not a real page
+		if t == '':
+			continue
+		if page_nums:
+			actual_count += 1
+		# Add text and page number to the existing string, or keep the text as-is if not page_nums
+		if page_nums:
 			text += '***Page {}*** {}'.format(actual, t)
-			file.close()
-	else:
-		# TO BE IMPLEMENTED
-		pass
+		else:
+			text = t
 
 	# Remove file saved to disk
 	os.remove(saved_file)