# read_text1.py (2.1 KB)

  1. #sudo apt install tesseract-ocr
  2. #pip pip install pytesseract
  3. from PIL import Image
  4. import pytesseract
  5. # If you don't have tesseract executable in your PATH, include the following:
  6. #pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'
  7. # Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
  8. # Simple image to string
  9. print ('Test1:')
  10. print (pytesseract.image_to_string(Image.open('test1.png')))
  11. print ('Test2:')
  12. print (pytesseract.image_to_string(Image.open('test2.jpg')))
  13. print ('Test3:')
  14. print (pytesseract.image_to_string(Image.open('test3.jpg')))
  15. exit()
  16. # In order to bypass the image conversions of pytesseract, just use relative or absolute image path
  17. # NOTE: In this case you should provide tesseract supported images or tesseract will return error
  18. print(pytesseract.image_to_string('test.png'))
  19. # List of available languages
  20. print(pytesseract.get_languages(config=''))
  21. # French text image to string
  22. print(pytesseract.image_to_string(Image.open('test-european.jpg'), lang='fra'))
  23. # Batch processing with a single file containing the list of multiple image file paths
  24. print(pytesseract.image_to_string('images.txt'))
  25. # Timeout/terminate the tesseract job after a period of time
  26. try:
  27. print(pytesseract.image_to_string('test.jpg', timeout=2)) # Timeout after 2 seconds
  28. print(pytesseract.image_to_string('test.jpg', timeout=0.5)) # Timeout after half a second
  29. except RuntimeError as timeout_error:
  30. # Tesseract processing is terminated
  31. pass
  32. # Get bounding box estimates
  33. print(pytesseract.image_to_boxes(Image.open('test.png')))
  34. # Get verbose data including boxes, confidences, line and page numbers
  35. print(pytesseract.image_to_data(Image.open('test.png')))
  36. # Get information about orientation and script detection
  37. print(pytesseract.image_to_osd(Image.open('test.png')))
  38. # Get a searchable PDF
  39. pdf = pytesseract.image_to_pdf_or_hocr('test.png', extension='pdf')
  40. with open('test.pdf', 'w+b') as f:
  41. f.write(pdf) # pdf type is bytes by default
  42. # Get HOCR output
  43. hocr = pytesseract.image_to_pdf_or_hocr('test.png', extension='hocr')
  44. # Get ALTO XML output
  45. xml = pytesseract.image_to_alto_xml('test.png')