From 5c433c3d8d57cfa3ef95e49d3604471fccb02a83 Mon Sep 17 00:00:00 2001 From: Mike Ganbold Date: Mon, 30 Nov 2020 16:54:48 -0800 Subject: [PATCH 1/5] fix: added if statement to filter out dir blob files --- .../batch_process_documents_sample_v1beta3.py | 35 +++--- ...localImageExtractionGoogleDocumentModel.py | 100 ++++++++++++++++++ samples/snippets/test.txt | 0 3 files changed, 118 insertions(+), 17 deletions(-) create mode 100644 samples/snippets/localImageExtractionGoogleDocumentModel.py create mode 100644 samples/snippets/test.txt diff --git a/samples/snippets/batch_process_documents_sample_v1beta3.py b/samples/snippets/batch_process_documents_sample_v1beta3.py index 6e22e0ea..cce03fc0 100644 --- a/samples/snippets/batch_process_documents_sample_v1beta3.py +++ b/samples/snippets/batch_process_documents_sample_v1beta3.py @@ -78,23 +78,24 @@ def batch_process_documents( for i, blob in enumerate(blob_list): # Download the contents of this blob as a bytes object. - blob_as_bytes = blob.download_as_bytes() - document = documentai.types.Document.from_json(blob_as_bytes) - - print(f"Fetched file {i + 1}") - - # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document - - # Read the text recognition output from the processor - for page in document.pages: - for form_field in page.form_fields: - field_name = get_text(form_field.field_name, document) - field_value = get_text(form_field.field_value, document) - print("Extracted key value pair:") - print(f"\t{field_name}, {field_value}") - for paragraph in document.pages: - paragraph_text = get_text(paragraph.layout, document) - print(f"Paragraph text:\n{paragraph_text}") + if '.json' in blob.name: + blob_as_bytes = blob.download_as_bytes() + + document = documentai.types.Document.from_json(blob_as_bytes) + print(f"Fetched file {i + 1}") + + # For a full list of Document object attributes, 
please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document + + # Read the text recognition output from the processor + for page in document.pages: + for form_field in page.form_fields: + field_name = get_text(form_field.field_name, document) + field_value = get_text(form_field.field_value, document) + print("Extracted key value pair:") + print(f"\t{field_name}, {field_value}") + for paragraph in document.pages: + paragraph_text = get_text(paragraph.layout, document) + print(f"Paragraph text:\n{paragraph_text}") # Extract shards from the text field diff --git a/samples/snippets/localImageExtractionGoogleDocumentModel.py b/samples/snippets/localImageExtractionGoogleDocumentModel.py new file mode 100644 index 00000000..ae8720ba --- /dev/null +++ b/samples/snippets/localImageExtractionGoogleDocumentModel.py @@ -0,0 +1,100 @@ +from google.cloud import vision +import io +import os +import re +from google.cloud import documentai_v1beta3 as documentai +from google.cloud import storage + +def process_document_sample(project_id: str, location: str, processor_id: str, file_path: str): + # Instantiates a client + client = documentai.DocumentProcessorServiceClient() + #client = DocumentProcessorServiceClientUs() + # The full resource name of the processor, e.g.: + # projects/project-id/locations/location/processor/processor-id + # You must create new processors in the Cloud Console first + + #name = f"projects/{project_id}/locations/{location}/processors/{processor_id}" + name = f"projects/{project_id}/locations/{location}/processors/{processor_id}" + #name = "https://us-documentai.googleapis.com/v1beta3/projects/poc-lab-280103/locations/us/processors/be6f7b359ec71fc9:process" + #name = "v1beta3/projects/poc-lab-280103/locations/us/processors/be6f7b359ec71fc9:process" + #name = "projects/poc-lab-280103/locations/us" + with open(file_path, "rb") as image: + image_content = image.read() + + # 
Read the file into memory + document = {"content": image_content, "mime_type": "application/pdf"} + + # Configure the process request + request = {"name": name, "document": document} + + # Recognizes text entities in the PDF document + result = client.process_document(request=request) + print("Llegue perros") + document = result.document + + print("Document processing complete.") + + # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document + + document_pages = document.pages + + # Read the text recognition output from the processor + print("The document contains the following paragraphs:") + for page in document_pages: + paragraphs = page.paragraphs + for paragraph in paragraphs: + paragraph_text = get_text(paragraph.layout, document) + print(f"Paragraph text: {paragraph_text}") + + +# Extract shards from the text field +def get_text(doc_element: dict, document: dict): + """ + Document AI identifies form fields by their offsets + in document text. This function converts offsets + to text snippets. + """ + response = "" + # If a text segment spans several lines, it will + # be stored in different text segments. 
+ for segment in doc_element.text_anchor.text_segments: + start_index = ( + int(segment.start_index) + if segment in doc_element.text_anchor.text_segments + else 0 + ) + end_index = int(segment.end_index) + response += document.text[start_index:end_index] + return response + +def parse_invoice(project_id='poc-lab-280103', + input_uri='gs://documentos-prueba-ocr/5.pdf'): + + client = documentai.DocumentUnderstandingServiceClient() + + gcs_source = documentai.types.GcsSource(uri=input_uri) + print(type(gcs_source)) + # mime_type can be application/pdf, image/tiff, + # and image/gif, or application/json + input_config = documentai.types.InputConfig( + gcs_source=gcs_source, mime_type='application/pdf') + + # Location can be 'us' or 'eu' + parent = 'projects/{}/locations/us'.format(project_id) + print(parent) + request = documentai.types.ProcessDocumentRequest( + parent=parent, + input_config=input_config) + + document = client.process_document(request=request) + + # All text extracted from the document + print('Document Text: {}'.format(document.text)) + +#parse_invoice() +project_id = os.environ["GOOGLE_CLOUD_PROJECT"] +region = 'us' +proccesor_id = '90484cfdedb024f6' +document = 'resources/invoice.pdf' +process_document_sample(project_id, region ,proccesor_id ,document) + diff --git a/samples/snippets/test.txt b/samples/snippets/test.txt new file mode 100644 index 00000000..e69de29b From 59742e4601eece50b81fc80cbec77830c7bc57bd Mon Sep 17 00:00:00 2001 From: Mike Ganbold Date: Mon, 30 Nov 2020 17:03:42 -0800 Subject: [PATCH 2/5] removed unnecessary files --- ...localImageExtractionGoogleDocumentModel.py | 100 ------------------ samples/snippets/test.txt | 0 2 files changed, 100 deletions(-) delete mode 100644 samples/snippets/localImageExtractionGoogleDocumentModel.py delete mode 100644 samples/snippets/test.txt diff --git a/samples/snippets/localImageExtractionGoogleDocumentModel.py b/samples/snippets/localImageExtractionGoogleDocumentModel.py deleted file mode
100644 index ae8720ba..00000000 --- a/samples/snippets/localImageExtractionGoogleDocumentModel.py +++ /dev/null @@ -1,100 +0,0 @@ -from google.cloud import vision -import io -import os -import re -from google.cloud import documentai_v1beta3 as documentai -from google.cloud import storage - -def process_document_sample(project_id: str, location: str, processor_id: str, file_path: str): - # Instantiates a client - client = documentai.DocumentProcessorServiceClient() - #client = DocumentProcessorServiceClientUs() - # The full resource name of the processor, e.g.: - # projects/project-id/locations/location/processor/processor-id - # You must create new processors in the Cloud Console first - - #name = f"projects/{project_id}/locations/{location}/processors/{processor_id}" - name = f"projects/{project_id}/locations/{location}/processors/{processor_id}" - #name = "https://us-documentai.googleapis.com/v1beta3/projects/poc-lab-280103/locations/us/processors/be6f7b359ec71fc9:process" - #name = "v1beta3/projects/poc-lab-280103/locations/us/processors/be6f7b359ec71fc9:process" - #name = "projects/poc-lab-280103/locations/us" - with open(file_path, "rb") as image: - image_content = image.read() - - # Read the file into memory - document = {"content": image_content, "mime_type": "application/pdf"} - - # Configure the process request - request = {"name": name, "document": document} - - # Recognizes text entities in the PDF document - result = client.process_document(request=request) - print("Llegue perros") - document = result.document - - print("Document processing complete.") - - # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document - - document_pages = document.pages - - # Read the text recognition output from the processor - print("The document contains the following paragraphs:") - for page in document_pages: - paragraphs = 
page.paragraphs - for paragraph in paragraphs: - paragraph_text = get_text(paragraph.layout, document) - print(f"Paragraph text: {paragraph_text}") - - -# Extract shards from the text field -def get_text(doc_element: dict, document: dict): - """ - Document AI identifies form fields by their offsets - in document text. This function converts offsets - to text snippets. - """ - response = "" - # If a text segment spans several lines, it will - # be stored in different text segments. - for segment in doc_element.text_anchor.text_segments: - start_index = ( - int(segment.start_index) - if segment in doc_element.text_anchor.text_segments - else 0 - ) - end_index = int(segment.end_index) - response += document.text[start_index:end_index] - return response - -def parse_invoice(project_id='poc-lab-280103', - input_uri='gs://documentos-prueba-ocr/5.pdf'): - - client = documentai.DocumentUnderstandingServiceClient() - - gcs_source = documentai.types.GcsSource(uri=input_uri) - print(type(gcs_source)) - # mime_type can be application/pdf, image/tiff, - # and image/gif, or application/json - input_config = documentai.types.InputConfig( - gcs_source=gcs_source, mime_type='application/pdf') - - # Location can be 'us' or 'eu' - parent = 'projects/{}/locations/us'.format(project_id) - print(parent) - request = documentai.types.ProcessDocumentRequest( - parent=parent, - input_config=input_config) - - document = client.process_document(request=request) - - # All text extracted from the document - print('Document Text: {}'.format(document.text)) - -#parse_invoice() -project_id = os.environ["GOOGLE_CLOUD_PROJECT"] -region = 'us' -proccesor_id = '90484cfdedb024f6' -document = 'resources/invoice.pdf' -process_document_sample(project_id, region ,proccesor_id ,document) - diff --git a/samples/snippets/test.txt b/samples/snippets/test.txt deleted file mode 100644 index e69de29b..00000000 From d22a3a7b5efaed64ec2e447cafcb2ab9b91134c2 Mon Sep 17 00:00:00 2001 From: Mike Ganbold Date: Tue, 1 
Dec 2020 10:29:43 -0800 Subject: [PATCH 3/5] fixed the lint --- samples/snippets/batch_process_documents_sample_v1beta3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/snippets/batch_process_documents_sample_v1beta3.py b/samples/snippets/batch_process_documents_sample_v1beta3.py index cce03fc0..fd2a9665 100644 --- a/samples/snippets/batch_process_documents_sample_v1beta3.py +++ b/samples/snippets/batch_process_documents_sample_v1beta3.py @@ -78,9 +78,9 @@ def batch_process_documents( for i, blob in enumerate(blob_list): # Download the contents of this blob as a bytes object. - if '.json' in blob.name: + if ".json" in blob.name: blob_as_bytes = blob.download_as_bytes() - + document = documentai.types.Document.from_json(blob_as_bytes) print(f"Fetched file {i + 1}") From 1f29bb04b8de6dbb153af0481c1def3b4fd48a31 Mon Sep 17 00:00:00 2001 From: Mike Ganbold Date: Wed, 2 Dec 2020 14:09:34 -0800 Subject: [PATCH 4/5] made changes according to the feedback --- .../batch_process_documents_sample_v1beta3.py | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/samples/snippets/batch_process_documents_sample_v1beta3.py b/samples/snippets/batch_process_documents_sample_v1beta3.py index fd2a9665..1dd3c45e 100644 --- a/samples/snippets/batch_process_documents_sample_v1beta3.py +++ b/samples/snippets/batch_process_documents_sample_v1beta3.py @@ -78,24 +78,25 @@ def batch_process_documents( for i, blob in enumerate(blob_list): # Download the contents of this blob as a bytes object. 
- if ".json" in blob.name: - blob_as_bytes = blob.download_as_bytes() - - document = documentai.types.Document.from_json(blob_as_bytes) - print(f"Fetched file {i + 1}") - - # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document - - # Read the text recognition output from the processor - for page in document.pages: - for form_field in page.form_fields: - field_name = get_text(form_field.field_name, document) - field_value = get_text(form_field.field_value, document) - print("Extracted key value pair:") - print(f"\t{field_name}, {field_value}") - for paragraph in document.pages: - paragraph_text = get_text(paragraph.layout, document) - print(f"Paragraph text:\n{paragraph_text}") + if ".json" not in blob.name: + return + blob_as_bytes = blob.download_as_bytes() + + document = documentai.types.Document.from_json(blob_as_bytes) + print(f"Fetched file {i + 1}") + + # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document + + # Read the text recognition output from the processor + for page in document.pages: + for form_field in page.form_fields: + field_name = get_text(form_field.field_name, document) + field_value = get_text(form_field.field_value, document) + print("Extracted key value pair:") + print(f"\t{field_name}, {field_value}") + for paragraph in document.pages: + paragraph_text = get_text(paragraph.layout, document) + print(f"Paragraph text:\n{paragraph_text}") # Extract shards from the text field From 4998b0443b1ac6422c8ca67fcaffdf34fe82314f Mon Sep 17 00:00:00 2001 From: Mike Ganbold Date: Wed, 2 Dec 2020 14:19:07 -0800 Subject: [PATCH 5/5] added helpful comment --- samples/snippets/batch_process_documents_sample_v1beta3.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/samples/snippets/batch_process_documents_sample_v1beta3.py b/samples/snippets/batch_process_documents_sample_v1beta3.py index 1dd3c45e..ea6c01e3 100644 --- a/samples/snippets/batch_process_documents_sample_v1beta3.py +++ b/samples/snippets/batch_process_documents_sample_v1beta3.py @@ -80,6 +80,7 @@ def batch_process_documents( # Download the contents of this blob as a bytes object. if ".json" not in blob.name: return + # Only parses JSON files blob_as_bytes = blob.download_as_bytes() document = documentai.types.Document.from_json(blob_as_bytes)