12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061 |
# OCR smoke test: run Tesseract (via pytesseract) over three sample images
# and print the recognised text of each one.
#
# Setup:
#   sudo apt install tesseract-ocr
#   pip install pytesseract
import sys

from PIL import Image
import pytesseract

# If you don't have tesseract executable in your PATH, include the following:
# pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'
# Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'

# Simple image to string: print each label, then the text read from its image.
for label, image_path in (
    ('Test1:', 'test1.png'),
    ('Test2:', 'test2.jpg'),
    ('Test3:', 'test3.jpg'),
):
    print(label)
    # The context manager closes the underlying image file once OCR is done,
    # instead of leaving the handle open until garbage collection.
    with Image.open(image_path) as image:
        print(pytesseract.image_to_string(image))

# Stop after the three tests. sys.exit() is used instead of the bare exit()
# helper, which is injected by the site module for interactive use and is
# not guaranteed to exist in every run mode.
sys.exit()

# ---------------------------------------------------------------------------
# Everything below is unreachable because of the exit above. It is kept as a
# reference of other pytesseract entry points.
# ---------------------------------------------------------------------------

# In order to bypass the image conversions of pytesseract, just use relative or absolute image path
# NOTE: In this case you should provide tesseract supported images or tesseract will return error
print(pytesseract.image_to_string('test.png'))

# List of available languages
print(pytesseract.get_languages(config=''))

# French text image to string
with Image.open('test-european.jpg') as image:
    print(pytesseract.image_to_string(image, lang='fra'))

# Batch processing with a single file containing the list of multiple image file paths
print(pytesseract.image_to_string('images.txt'))

# Timeout/terminate the tesseract job after a period of time
try:
    print(pytesseract.image_to_string('test.jpg', timeout=2))  # Timeout after 2 seconds
    print(pytesseract.image_to_string('test.jpg', timeout=0.5))  # Timeout after half a second
except RuntimeError:
    # Tesseract processing is terminated; deliberately ignored in this example
    pass

with Image.open('test.png') as image:
    # Get bounding box estimates
    print(pytesseract.image_to_boxes(image))
    # Get verbose data including boxes, confidences, line and page numbers
    print(pytesseract.image_to_data(image))
    # Get information about orientation and script detection
    print(pytesseract.image_to_osd(image))

# Get a searchable PDF
pdf = pytesseract.image_to_pdf_or_hocr('test.png', extension='pdf')
with open('test.pdf', 'w+b') as f:
    f.write(pdf)  # pdf type is bytes by default

# Get HOCR output
hocr = pytesseract.image_to_pdf_or_hocr('test.png', extension='hocr')

# Get ALTO XML output
xml = pytesseract.image_to_alto_xml('test.png')
|