From 5c433c3d8d57cfa3ef95e49d3604471fccb02a83 Mon Sep 17 00:00:00 2001
From: Mike Ganbold <munkhbayarg@google.com>
Date: Mon, 30 Nov 2020 16:54:48 -0800
Subject: [PATCH 1/5] fix: added if statement to filter out dir blob files

---
 .../batch_process_documents_sample_v1beta3.py |  35 +++---
 ...localImageExtractionGoogleDocumentModel.py | 100 ++++++++++++++++++
 samples/snippets/test.txt                     |   0
 3 files changed, 118 insertions(+), 17 deletions(-)
 create mode 100644 samples/snippets/localImageExtractionGoogleDocumentModel.py
 create mode 100644 samples/snippets/test.txt

diff --git a/samples/snippets/batch_process_documents_sample_v1beta3.py b/samples/snippets/batch_process_documents_sample_v1beta3.py
index 6e22e0ea..cce03fc0 100644
--- a/samples/snippets/batch_process_documents_sample_v1beta3.py
+++ b/samples/snippets/batch_process_documents_sample_v1beta3.py
@@ -78,23 +78,24 @@ def batch_process_documents(
 
     for i, blob in enumerate(blob_list):
         # Download the contents of this blob as a bytes object.
-        blob_as_bytes = blob.download_as_bytes()
-        document = documentai.types.Document.from_json(blob_as_bytes)
-
-        print(f"Fetched file {i + 1}")
-
-        # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
-
-        # Read the text recognition output from the processor
-        for page in document.pages:
-            for form_field in page.form_fields:
-                field_name = get_text(form_field.field_name, document)
-                field_value = get_text(form_field.field_value, document)
-                print("Extracted key value pair:")
-                print(f"\t{field_name}, {field_value}")
-            for paragraph in document.pages:
-                paragraph_text = get_text(paragraph.layout, document)
-                print(f"Paragraph text:\n{paragraph_text}")
+        if '.json' in blob.name:
+            blob_as_bytes = blob.download_as_bytes()
+            
+            document = documentai.types.Document.from_json(blob_as_bytes)
+            print(f"Fetched file {i + 1}")
+
+            # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
+
+            # Read the text recognition output from the processor
+            for page in document.pages:
+                for form_field in page.form_fields:
+                    field_name = get_text(form_field.field_name, document)
+                    field_value = get_text(form_field.field_value, document)
+                    print("Extracted key value pair:")
+                    print(f"\t{field_name}, {field_value}")
+                for paragraph in document.pages:
+                    paragraph_text = get_text(paragraph.layout, document)
+                    print(f"Paragraph text:\n{paragraph_text}")
 
 
 # Extract shards from the text field
diff --git a/samples/snippets/localImageExtractionGoogleDocumentModel.py b/samples/snippets/localImageExtractionGoogleDocumentModel.py
new file mode 100644
index 00000000..ae8720ba
--- /dev/null
+++ b/samples/snippets/localImageExtractionGoogleDocumentModel.py
@@ -0,0 +1,100 @@
+from google.cloud import vision
+import io
+import os
+import re
+from google.cloud import documentai_v1beta3 as documentai
+from google.cloud import storage
+
+def process_document_sample(project_id: str, location: str, processor_id: str, file_path: str):
+    # Instantiates a client
+    client = documentai.DocumentProcessorServiceClient()
+    #client = DocumentProcessorServiceClientUs()
+    # The full resource name of the processor, e.g.:
+    # projects/project-id/locations/location/processor/processor-id
+    # You must create new processors in the Cloud Console first
+
+    #name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
+    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
+    #name =  "https://us-documentai.googleapis.com/v1beta3/projects/poc-lab-280103/locations/us/processors/be6f7b359ec71fc9:process"
+    #name = "v1beta3/projects/poc-lab-280103/locations/us/processors/be6f7b359ec71fc9:process"
+    #name = "projects/poc-lab-280103/locations/us"
+    with open(file_path, "rb") as image:
+        image_content = image.read()
+
+    # Read the file into memory
+    document = {"content": image_content, "mime_type": "application/pdf"}
+
+    # Configure the process request
+    request = {"name": name, "document": document}
+    
+    # Recognizes text entities in the PDF document
+    result = client.process_document(request=request)
+    print("Llegue perros")
+    document = result.document
+
+    print("Document processing complete.")
+
+    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
+
+    document_pages = document.pages
+
+    # Read the text recognition output from the processor
+    print("The document contains the following paragraphs:")
+    for page in document_pages:
+        paragraphs = page.paragraphs
+        for paragraph in paragraphs:
+            paragraph_text = get_text(paragraph.layout, document)
+            print(f"Paragraph text: {paragraph_text}")
+
+
+# Extract shards from the text field
+def get_text(doc_element: dict, document: dict):
+    """
+    Document AI identifies form fields by their offsets
+    in document text. This function converts offsets
+    to text snippets.
+    """
+    response = ""
+    # If a text segment spans several lines, it will
+    # be stored in different text segments.
+    for segment in doc_element.text_anchor.text_segments:
+        start_index = (
+            int(segment.start_index)
+            if segment in doc_element.text_anchor.text_segments
+            else 0
+        )
+        end_index = int(segment.end_index)
+        response += document.text[start_index:end_index]
+    return response
+
+def parse_invoice(project_id='poc-lab-280103',
+         input_uri='gs://documentos-prueba-ocr/5.pdf'):
+
+    client = documentai.DocumentUnderstandingServiceClient()
+
+    gcs_source = documentai.types.GcsSource(uri=input_uri)
+    print(type(gcs_source))
+    # mime_type can be application/pdf, image/tiff,
+    # and image/gif, or application/json
+    input_config = documentai.types.InputConfig(
+        gcs_source=gcs_source, mime_type='application/pdf')
+
+    # Location can be 'us' or 'eu'
+    parent = 'projects/{}/locations/us'.format(project_id)
+    print(parent)
+    request = documentai.types.ProcessDocumentRequest(
+        parent=parent,
+        input_config=input_config)
+
+    document = client.process_document(request=request)
+
+    # All text extracted from the document
+    print('Document Text: {}'.format(document.text))
+
+#parse_invoice()
+project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
+region = 'us'
+proccesor_id = '90484cfdedb024f6'
+document = 'resources/invoice.pdf'
+process_document_sample(project_id, region ,proccesor_id ,document)
+
diff --git a/samples/snippets/test.txt b/samples/snippets/test.txt
new file mode 100644
index 00000000..e69de29b

From 59742e4601eece50b81fc80cbec77830c7bc57bd Mon Sep 17 00:00:00 2001
From: Mike Ganbold <munkhbayarg@google.com>
Date: Mon, 30 Nov 2020 17:03:42 -0800
Subject: [PATCH 2/5] removed unnecessary files

---
 ...localImageExtractionGoogleDocumentModel.py | 100 ------------------
 samples/snippets/test.txt                     |   0
 2 files changed, 100 deletions(-)
 delete mode 100644 samples/snippets/localImageExtractionGoogleDocumentModel.py
 delete mode 100644 samples/snippets/test.txt

diff --git a/samples/snippets/localImageExtractionGoogleDocumentModel.py b/samples/snippets/localImageExtractionGoogleDocumentModel.py
deleted file mode 100644
index ae8720ba..00000000
--- a/samples/snippets/localImageExtractionGoogleDocumentModel.py
+++ /dev/null
@@ -1,100 +0,0 @@
-from google.cloud import vision
-import io
-import os
-import re
-from google.cloud import documentai_v1beta3 as documentai
-from google.cloud import storage
-
-def process_document_sample(project_id: str, location: str, processor_id: str, file_path: str):
-    # Instantiates a client
-    client = documentai.DocumentProcessorServiceClient()
-    #client = DocumentProcessorServiceClientUs()
-    # The full resource name of the processor, e.g.:
-    # projects/project-id/locations/location/processor/processor-id
-    # You must create new processors in the Cloud Console first
-
-    #name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
-    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
-    #name =  "https://us-documentai.googleapis.com/v1beta3/projects/poc-lab-280103/locations/us/processors/be6f7b359ec71fc9:process"
-    #name = "v1beta3/projects/poc-lab-280103/locations/us/processors/be6f7b359ec71fc9:process"
-    #name = "projects/poc-lab-280103/locations/us"
-    with open(file_path, "rb") as image:
-        image_content = image.read()
-
-    # Read the file into memory
-    document = {"content": image_content, "mime_type": "application/pdf"}
-
-    # Configure the process request
-    request = {"name": name, "document": document}
-    
-    # Recognizes text entities in the PDF document
-    result = client.process_document(request=request)
-    print("Llegue perros")
-    document = result.document
-
-    print("Document processing complete.")
-
-    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
-
-    document_pages = document.pages
-
-    # Read the text recognition output from the processor
-    print("The document contains the following paragraphs:")
-    for page in document_pages:
-        paragraphs = page.paragraphs
-        for paragraph in paragraphs:
-            paragraph_text = get_text(paragraph.layout, document)
-            print(f"Paragraph text: {paragraph_text}")
-
-
-# Extract shards from the text field
-def get_text(doc_element: dict, document: dict):
-    """
-    Document AI identifies form fields by their offsets
-    in document text. This function converts offsets
-    to text snippets.
-    """
-    response = ""
-    # If a text segment spans several lines, it will
-    # be stored in different text segments.
-    for segment in doc_element.text_anchor.text_segments:
-        start_index = (
-            int(segment.start_index)
-            if segment in doc_element.text_anchor.text_segments
-            else 0
-        )
-        end_index = int(segment.end_index)
-        response += document.text[start_index:end_index]
-    return response
-
-def parse_invoice(project_id='poc-lab-280103',
-         input_uri='gs://documentos-prueba-ocr/5.pdf'):
-
-    client = documentai.DocumentUnderstandingServiceClient()
-
-    gcs_source = documentai.types.GcsSource(uri=input_uri)
-    print(type(gcs_source))
-    # mime_type can be application/pdf, image/tiff,
-    # and image/gif, or application/json
-    input_config = documentai.types.InputConfig(
-        gcs_source=gcs_source, mime_type='application/pdf')
-
-    # Location can be 'us' or 'eu'
-    parent = 'projects/{}/locations/us'.format(project_id)
-    print(parent)
-    request = documentai.types.ProcessDocumentRequest(
-        parent=parent,
-        input_config=input_config)
-
-    document = client.process_document(request=request)
-
-    # All text extracted from the document
-    print('Document Text: {}'.format(document.text))
-
-#parse_invoice()
-project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
-region = 'us'
-proccesor_id = '90484cfdedb024f6'
-document = 'resources/invoice.pdf'
-process_document_sample(project_id, region ,proccesor_id ,document)
-
diff --git a/samples/snippets/test.txt b/samples/snippets/test.txt
deleted file mode 100644
index e69de29b..00000000

From d22a3a7b5efaed64ec2e447cafcb2ab9b91134c2 Mon Sep 17 00:00:00 2001
From: Mike Ganbold <munkhbayarg@google.com>
Date: Tue, 1 Dec 2020 10:29:43 -0800
Subject: [PATCH 3/5] fixed the lint

---
 samples/snippets/batch_process_documents_sample_v1beta3.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/snippets/batch_process_documents_sample_v1beta3.py b/samples/snippets/batch_process_documents_sample_v1beta3.py
index cce03fc0..fd2a9665 100644
--- a/samples/snippets/batch_process_documents_sample_v1beta3.py
+++ b/samples/snippets/batch_process_documents_sample_v1beta3.py
@@ -78,9 +78,9 @@ def batch_process_documents(
 
     for i, blob in enumerate(blob_list):
         # Download the contents of this blob as a bytes object.
-        if '.json' in blob.name:
+        if ".json" in blob.name:
             blob_as_bytes = blob.download_as_bytes()
-            
+
             document = documentai.types.Document.from_json(blob_as_bytes)
             print(f"Fetched file {i + 1}")
 

From 1f29bb04b8de6dbb153af0481c1def3b4fd48a31 Mon Sep 17 00:00:00 2001
From: Mike Ganbold <munkhbayarg@google.com>
Date: Wed, 2 Dec 2020 14:09:34 -0800
Subject: [PATCH 4/5] made changes according to the feedback

---
 .../batch_process_documents_sample_v1beta3.py | 37 ++++++++++---------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/samples/snippets/batch_process_documents_sample_v1beta3.py b/samples/snippets/batch_process_documents_sample_v1beta3.py
index fd2a9665..1dd3c45e 100644
--- a/samples/snippets/batch_process_documents_sample_v1beta3.py
+++ b/samples/snippets/batch_process_documents_sample_v1beta3.py
@@ -78,24 +78,25 @@ def batch_process_documents(
 
     for i, blob in enumerate(blob_list):
         # Download the contents of this blob as a bytes object.
-        if ".json" in blob.name:
-            blob_as_bytes = blob.download_as_bytes()
-
-            document = documentai.types.Document.from_json(blob_as_bytes)
-            print(f"Fetched file {i + 1}")
-
-            # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
-
-            # Read the text recognition output from the processor
-            for page in document.pages:
-                for form_field in page.form_fields:
-                    field_name = get_text(form_field.field_name, document)
-                    field_value = get_text(form_field.field_value, document)
-                    print("Extracted key value pair:")
-                    print(f"\t{field_name}, {field_value}")
-                for paragraph in document.pages:
-                    paragraph_text = get_text(paragraph.layout, document)
-                    print(f"Paragraph text:\n{paragraph_text}")
+        if ".json" not in blob.name:
+            return
+        blob_as_bytes = blob.download_as_bytes()
+
+        document = documentai.types.Document.from_json(blob_as_bytes)
+        print(f"Fetched file {i + 1}")
+
+        # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
+
+        # Read the text recognition output from the processor
+        for page in document.pages:
+            for form_field in page.form_fields:
+                field_name = get_text(form_field.field_name, document)
+                field_value = get_text(form_field.field_value, document)
+                print("Extracted key value pair:")
+                print(f"\t{field_name}, {field_value}")
+            for paragraph in document.pages:
+                paragraph_text = get_text(paragraph.layout, document)
+                print(f"Paragraph text:\n{paragraph_text}")
 
 
 # Extract shards from the text field

From 4998b0443b1ac6422c8ca67fcaffdf34fe82314f Mon Sep 17 00:00:00 2001
From: Mike Ganbold <munkhbayarg@google.com>
Date: Wed, 2 Dec 2020 14:19:07 -0800
Subject: [PATCH 5/5] added helpful comment

Also skip non-JSON blobs with `continue` instead of `return`, so that a
directory placeholder or other non-JSON blob no longer stops the loop
before the remaining JSON result files are parsed.

---
Note: the hunk below was edited by hand during review; the post-image blob
id on the "index" line has not been regenerated.

 samples/snippets/batch_process_documents_sample_v1beta3.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/samples/snippets/batch_process_documents_sample_v1beta3.py b/samples/snippets/batch_process_documents_sample_v1beta3.py
index 1dd3c45e..ea6c01e3 100644
--- a/samples/snippets/batch_process_documents_sample_v1beta3.py
+++ b/samples/snippets/batch_process_documents_sample_v1beta3.py
@@ -80,6 +80,7 @@ def batch_process_documents(
         # Download the contents of this blob as a bytes object.
         if ".json" not in blob.name:
-            return
+            continue
+        # Only parses JSON files
         blob_as_bytes = blob.download_as_bytes()
 
         document = documentai.types.Document.from_json(blob_as_bytes)