# Extract the text of F451.pdf into `text` (lower-cased), falling back to OCR
# when PyPDF2 finds no text in the document.
# NOTE(review): PdfFileReader / numPages / getPage / extractText are the legacy
# PyPDF2 names; newer PyPDF2 releases renamed them -- confirm the pinned version.
pdf_path = 'F451.pdf'

# The context manager closes the file handle once every page has been read.
# F451 must not be used after this block, because its underlying file is closed.
with open(pdf_path, 'rb') as FRN451:
    F451 = PyPDF2.PdfFileReader(FRN451)
    # Knowing the number of pages lets us visit every page of the document.
    num_pages = F451.numPages
    # Gather each page's text and join once instead of growing a string with +=.
    text = "".join(
        F451.getPage(page_index).extractText()
        for page_index in range(num_pages)
    )

text = text.lower()

# PyPDF2 cannot read scanned (image-based) PDFs, so check whether it returned
# any words at all. Whitespace-only output counts as "nothing extracted".
if not text.strip():
    # Run the OCR library textract (tesseract backend) on the same file that
    # was opened above; the previous code passed an undefined `fileurl` here.
    ocr_output = textract.process(pdf_path, method='tesseract', language='eng')
    # textract returns bytes; decode so `text` is a string on both paths.
    # NOTE(review): the OCR text is not lower-cased, matching the old behavior --
    # confirm whether downstream code expects lower-case here too.
    text = ocr_output.decode('utf-8')