"""Minimal pytesseract OCR demo.

Run as a script, this prints the text recognised in three sample images
(``test1.png``, ``test2.jpg``, ``test3.jpg``) from the current working
directory and then exits normally.

``extended_examples`` collects further pytesseract usage snippets. It is
never called by default: in the original flat script those statements sat
after an unconditional ``exit()`` and were therefore unreachable.
"""

# System prerequisite:  sudo apt install tesseract-ocr
# Python prerequisite:  pip install pytesseract

from PIL import Image

import pytesseract

# If you don't have tesseract executable in your PATH, include the following:
# pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'
# Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'

# (label printed before the OCR text, image path) for each sample run by main().
SAMPLE_IMAGES = (
    ('Test1:', 'test1.png'),
    ('Test2:', 'test2.jpg'),
    ('Test3:', 'test3.jpg'),
)


def ocr_image(path):
    """Return the text pytesseract recognises in the image file at *path*.

    The image is opened through PIL and closed again as soon as OCR has
    finished, so no file handle is left open.
    """
    with Image.open(path) as image:
        return pytesseract.image_to_string(image)


def main():
    """Print the label and recognised text for every entry in SAMPLE_IMAGES."""
    for label, path in SAMPLE_IMAGES:
        print(label)
        print(ocr_image(path))


def extended_examples():
    """Run the additional pytesseract usage examples.

    Expects ``test.png``, ``test.jpg``, ``test-european.jpg`` and
    ``images.txt`` in the current working directory and writes ``test.pdf``.

    Returns:
        A ``(hocr, alto_xml)`` pair holding the raw hOCR and ALTO XML output
        produced for ``test.png``.
    """
    # In order to bypass the image conversions of pytesseract, just use a
    # relative or absolute image path.
    # NOTE: In this case you should provide tesseract supported images or
    # tesseract will return an error.
    print(pytesseract.image_to_string('test.png'))

    # List of available languages
    print(pytesseract.get_languages(config=''))

    # French text image to string
    with Image.open('test-european.jpg') as french_image:
        print(pytesseract.image_to_string(french_image, lang='fra'))

    # Batch processing with a single file containing the list of multiple
    # image file paths
    print(pytesseract.image_to_string('images.txt'))

    # Timeout/terminate the tesseract job after a period of time
    try:
        print(pytesseract.image_to_string('test.jpg', timeout=2))  # Timeout after 2 seconds
        print(pytesseract.image_to_string('test.jpg', timeout=0.5))  # Timeout after half a second
    except RuntimeError as timeout_error:
        # Tesseract processing is terminated; report it instead of silently
        # swallowing the error, then carry on with the remaining examples.
        print(f'Tesseract job terminated: {timeout_error}')

    with Image.open('test.png') as image:
        # Get bounding box estimates
        print(pytesseract.image_to_boxes(image))

        # Get verbose data including boxes, confidences, line and page numbers
        print(pytesseract.image_to_data(image))

        # Get information about orientation and script detection
        print(pytesseract.image_to_osd(image))

    # Get a searchable PDF
    pdf = pytesseract.image_to_pdf_or_hocr('test.png', extension='pdf')
    with open('test.pdf', 'w+b') as pdf_file:
        pdf_file.write(pdf)  # pdf type is bytes by default

    # Get HOCR output
    hocr = pytesseract.image_to_pdf_or_hocr('test.png', extension='hocr')

    # Get ALTO XML output
    alto_xml = pytesseract.image_to_alto_xml('test.png')

    return hocr, alto_xml


if __name__ == '__main__':
    # Falling off the end of the script exits with status 0, which replaces
    # the original bare exit() (a `site` helper meant for interactive use).
    main()