Python Sample Notebook

Options

# Base URL of the Data Curation API; endpoint paths are appended to it.
dc_api_url = "https://knowledge-enrichment.ai.experience.hyland.com/latest/api/data-curation"
# Processing options sent as the JSON body of the presign request.
dc_options = {
    "normalization": {
        "quotations": True,  # Normalize quotation marks
        "dashes": True,  # Normalize dashes
    },
    "chunking": True,  # Enables chunking of the text
    "chunk_size": 1000,  # Desired size of each text chunk (may be longer to prevent breaking sentences/paragraphs)
    "embedding": False,  # Generates text embeddings for the chunks (chunking is required)
    "json_schema": False,  # "FULL", "MDAST", "PIPELINE", or False
    "pii": False,  # "redaction", "detection", or False
}

Credentials

# OAuth client credentials need to be configured in the admin console.
# https://admin.experience.hyland.com/external-systems/external-applications
# Ensure the external application has the `environment_authorization` scope and is configured for an environment
# that has the `cin-data-curation-api` application (environment has a Data Curation API subscription).
oauth_url = "https://auth.iam.experience.hyland.com/idp"
oauth_client_id = "<<REDACTED>>"
oauth_client_secret = "<<REDACTED>>"

Helper Functions

import base64
import datetime
from IPython.display import display, HTML, Markdown
import json
import os
from urllib.error import HTTPError
from urllib.parse import urlencode
from urllib.request import Request, urlopen

def send_http_request(method: str, url: str, headers: dict = {}, data: bytes | None = None) -> dict:
    """Sends an HTTP request and returns the response body as a dictionary."""
    request = Request(url=url, data=data, headers=headers, method=method)
    print(f"Sending request to: {request.method} {request.full_url}")
    response_body = {}
    try:
        response = urlopen(request)
        response_bytes = response.read()
        if len(response_bytes) > 0:
            response_body = json.loads(response_bytes.decode())
    except HTTPError as e:
        display(f"HTTP Error: {e.code} - {e.read().decode()}")
        if e.code != 404:
            raise
    finally:
        if "response" in locals():
            response.close()

    return response_body

def send_get_request(url: str, headers: dict = {}) -> dict:
    """Sends a GET request to the specified URL with optional headers."""
    return send_http_request(method="GET", url=url, headers=headers)

def send_post_request(url: str, headers: dict = {}, data: bytes | None = None) -> dict:
    """Sends a POST request to the specified URL with optional headers and data."""
    return send_http_request(method="POST", url=url, headers=headers, data=data)

def send_json_post_request(url: str, headers: dict = {}, data: dict | None = None) -> dict:
    """Sends a POST request to the specified URL with optional headers and JSON data."""
    headers["Content-Type"] = "application/json"
    json_data = json.dumps(data).encode()
    return send_post_request(url=url, data=json_data, headers=headers)

def send_form_post_request(url: str, headers: dict = {}, data: dict | None = None) -> dict:
    """Sends a POST request to the specified URL with optional headers and form data."""
    headers["Content-Type"] = "application/x-www-form-urlencoded"
    form_data = urlencode(data, doseq=True).encode()
    return send_post_request(url=url, data=form_data, headers=headers)

def send_put_request(url: str, headers: dict = {}, data: bytes | None = None) -> dict:
    """Sends a PUT request to the specified URL with optional headers and data."""
    return send_http_request(method="PUT", url=url, headers=headers, data=data)

def refresh_access_token() -> str | None:
    """Request a new access token with the OAuth client-credentials grant.

    Reads the module-level ``oauth_url``, ``oauth_client_id`` and
    ``oauth_client_secret`` and posts a form request to
    ``{oauth_url}/connect/token`` using HTTP Basic authentication.

    Returns:
        The "access_token" value of the token response, or ``None`` when the
        response does not contain one.
    """
    print("Getting a new access token")
    # Built outside the f-string: reusing the same quote type inside an
    # f-string expression is a syntax error before Python 3.12.
    basic_credentials = base64_encode(f"{oauth_client_id}:{oauth_client_secret}")
    headers = {
        "Accept": "application/json",
        "Authorization": f"Basic {basic_credentials}",
    }
    body = {
        "grant_type": "client_credentials",
        "scope": "environment_authorization",
    }
    token_url = f"{oauth_url}/connect/token"
    token_response = send_form_post_request(url=token_url, data=body, headers=headers)
    access_token = token_response.get("access_token")
    # Called for its printed report (remaining lifetime or problem); the result is not used.
    is_access_token_valid(access_token)

    return access_token

def decode_access_token(token: str) -> dict | None:
    """Decodes the access token to extract its JSON payload."""
    return json_loads(base64_decode(token.split(".")[1]))

def is_access_token_valid(token: str) -> bool:
    """Report whether *token* is usable for at least another 30 seconds.

    The remaining lifetime is taken from the token's "exp" value. An empty
    token, or any error while decoding it, is reported on stdout and counts
    as invalid.
    """
    if token:
        try:
            claims = decode_access_token(token)
            expires_at = datetime.datetime.fromtimestamp(claims["exp"])
            remaining = expires_at - datetime.datetime.now()
            print(f"Access Token Expires In: {remaining}")
            print()
            # Require a 30 second safety margin
            minimum_lifetime = datetime.timedelta(seconds=30)
            return remaining > minimum_lifetime
        except Exception as error:
            print(f"Error validating access token: {error}")
            print()
            return False

    print("Access token is empty")
    print()
    return False
    
def base64_encode(data: str) -> str:
    """Return the base64 text for the encoded bytes of *data*."""
    raw_bytes = data.encode()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode()

def base64_decode(data: str) -> str:
    """Return the text obtained by base64-decoding *data*."""
    encoded_bytes = data.encode()
    raw_bytes = base64.b64decode(encoded_bytes)
    return raw_bytes.decode()
    
def json_dumps(obj) -> str:
    """Serialize *obj* to JSON text indented by two spaces.

    Values the JSON encoder cannot handle natively are written as ``str(value)``.
    """
    encoder = json.JSONEncoder(indent=2, default=str)
    return encoder.encode(obj)

def json_loads(json_data: str):
    """Parse JSON text and return the resulting Python object."""
    parsed = json.loads(json_data)
    return parsed

def list_files(directory: str) -> list[str]:
    """Display and return the regular files found directly inside *directory*.

    Args:
        directory: Directory to scan; resolved to an absolute path first.

    Returns:
        Absolute paths of the files, in the order ``os.listdir`` reports them.
        Sub-directories are left out because callers open every returned path
        as a file.
    """
    directory_abs = os.path.abspath(directory)
    file_list = [
        entry_name
        for entry_name in os.listdir(directory_abs)
        if os.path.isfile(os.path.join(directory_abs, entry_name))
    ]
    # Joined outside the f-string: a backslash inside an f-string expression
    # is a syntax error before Python 3.12.
    bullet_lines = "\r\n- ".join(file_list)
    display(Markdown(f"Listing files in path: {directory_abs}\r\n- {bullet_lines}"))
    return [os.path.abspath(os.path.join(directory_abs, file_name)) for file_name in file_list]

def obj_to_html(obj, top_level: bool = True) -> str:
    """Render *obj* as an HTML fragment, recursing into nested values.

    Objects with a ``__dict__``, lists and dicts become two-column tables
    (field/index/key and value). Anything else is converted with ``str`` with
    ``<``/``>`` replaced by entities and newlines by ``<br/>``. Nested values
    other than non-empty strings and ``None`` are wrapped in a collapsible
    ``<details>`` element labelled with the value's class name.
    """
    table = "<table><thead><tr><th><i>{label}</i></th><th>value</th></tr></thead><tbody>{rows}</tbody></table>"

    def render_row(name, value) -> str:
        # Nested values are always rendered as non-top-level
        return f"<tr><td>{name}</td><td>{obj_to_html(value, False)}</td></tr>"

    if hasattr(obj, "__dict__"):
        pairs, label = vars(obj).items(), "field"
    elif isinstance(obj, list):
        pairs, label = enumerate(obj), "index"
    elif isinstance(obj, dict):
        pairs, label = obj.items(), "key"
    else:
        pairs, label = None, None

    if label is None:
        body = str(obj).replace("<", "&lt;").replace(">", "&gt;").replace("\n", "<br/>")
    else:
        body = table.format(label=label, rows="".join(render_row(name, value) for name, value in pairs))

    is_plain_text = isinstance(obj, str) or obj is None
    if top_level or (body and is_plain_text):
        return body

    # top_level is always False when this line is reached, so the attribute is empty
    open_attr = 'open="open"' if top_level else ''
    return f"""
        <details {open_attr} class="dni-treeview">
            <summary>
                <span class="dni-code-hint">
                    <code>{obj.__class__.__name__}</code>
                </span>
            </summary>
            <div>{body}</div>
        </details>
            """

def display_html(obj):
    """Render *obj* with ``obj_to_html`` and show it, followed by the tree-view CSS."""
    stylesheet = """
        <style>
        .dni-code-hint {
            font-style: italic;
            overflow: hidden;
            white-space: nowrap;
        }
        .dni-treeview {
            white-space: nowrap;
        }
        .dni-treeview td {
            vertical-align: top;
            text-align: start;
        }
        details.dni-treeview {
            padding-left: 1em;
        }
        table td {
            text-align: start;
        }
        table tr { 
            vertical-align: top; 
            margin: 0em 0px;
        }
        table tr td pre 
        { 
            vertical-align: top !important; 
            margin: 0em 0px !important;
        } 
        table th {
            text-align: start;
        }
        </style>
    """
    # The markup comes first; the style block is appended after it
    markup = obj_to_html(obj)
    display(HTML(markup + stylesheet))

List Files to Upload

# Expects there to be an "input" directory (relative to the current working directory) with files to process.
# list_files displays the directory listing and returns the absolute path of every entry.
input_file_names = list_files("input")

Sample Output

Listing files in path: c:\path\to\files

- test.docx

Get Access Token

# Request a token; refresh_access_token returns None when the token response has no "access_token".
access_token = refresh_access_token()
if access_token:
    # Decode the token payload and print it for inspection
    token_json = decode_access_token(access_token)
    print(f"Access Token: {json_dumps(token_json)}")

Sample Output

    Getting a new access token
    Sending request to: POST https://auth.iam.experience.hyland.com/idp/connect/token
    Access Token Expires In: 0:14:58.983019
    
    Access Token:

    {
      "iss": "https://auth.iam.experience.hyland.com/idp",
      "nbf": 1755626186,
      "iat": 1755626186,
      "exp": 1755627086,
      "aud": "hxp.authorization",
      "scope": [
        "environment_authorization"
      ],
      "amr": [
        "urn:hyland:params:oauth:grant-type:api-credentials"
      ],
      "client_id": "<<REDACTED>>",
      "sub": "<<REDACTED>>",
      "auth_time": 1755626186,
      "idp": "local",
      "hxp_account": "<<REDACTED>>",
      "hxp_authorization": {
        "account_id": "<<REDACTED>>",
        "environment_id": "<<REDACTED>>",
        "subscription_id": "<<REDACTED>>",
        "appkey": "cin-data-curation-api",
        "role": [
          "cin-data-curation.user"
        ],
        "permission": [
          "cin-data-curation.api.submit"
        ]
      }
    }

Get Presign URL

from dataclasses import dataclass

# Keep the current token unless it is empty, cannot be decoded, or expires within 30 seconds
if not is_access_token_valid(access_token):
    access_token = refresh_access_token()

@dataclass
class PresignResponse:
    """Values of one presign response together with the input file it was requested for."""
    job_id: str | None  # "job_id" of the presign response (None if the key is missing)
    put_url: str | None  # "put_url" of the presign response; the file is uploaded with PUT to this URL
    get_url: str | None  # "get_url" of the presign response; the results are downloaded from this URL
    file_name: str  # Path of the local input file

# Request one presign response (job id plus upload/download URLs) per input file.
presign_responses = []
presign_url = f"{dc_api_url}/presign"  # Same endpoint for every file
for input_file_name in input_file_names:
    presign_headers = {
        "Authorization": f"Bearer {access_token}"
    }
    presign_response_json = send_json_post_request(url=presign_url, headers=presign_headers, data=dc_options)
    print(f"Presign Response for {input_file_name}:")
    print(json_dumps(presign_response_json))
    print()
    presign_response = PresignResponse(
        job_id=presign_response_json.get("job_id"),
        put_url=presign_response_json.get("put_url"),
        get_url=presign_response_json.get("get_url"),
        file_name=input_file_name,
    )
    required_values = (presign_response.job_id, presign_response.put_url, presign_response.get_url)
    if any(value is None for value in required_values):
        # An incomplete response cannot be uploaded or downloaded; say so instead of dropping the file silently.
        print(f"Skipping {input_file_name}: presign response is missing job_id, put_url or get_url")
        print()
        continue
    presign_responses.append(presign_response)

Sample Output

    Access Token Expires In: 0:14:58.976669
    
    Sending request to: POST https://knowledge-enrichment.ai.experience.hyland.com/latest/api/data-curation/presign
    Presign Response for c:\path\to\files\test.docx:

    {
      "job_id": "API_8d5b8381-567d-4471-b0ee-619b5b7601c7",
      "put_url": "https://data-curation-api-drop-prod.s3.amazonaws.com/API_8d5b8381-567d-4471-b0ee-619b5b7601c7?Expires=1755629787",
      "get_url": "https://data-curation-api-results-prod.s3.amazonaws.com/API_8d5b8381-567d-4471-b0ee-619b5b7601c7?Expires=1755629787"
    }

Upload Files

# Upload every input file to the put_url of its presign response.
for presign_response in presign_responses:
    # Nothing to upload without a destination URL or a source file
    if not presign_response.put_url or not presign_response.file_name:
        continue

    print(f"Uploading file: {presign_response.file_name}")
    # The whole file is read into memory as raw bytes
    with open(presign_response.file_name, mode="rb") as input_file:
        input_bytes = input_file.read()
    length = len(input_bytes)
    put_headers = {
        'Content-Type': 'application/octet-stream',
        'Content-Length': length
    }
    # The body of the PUT response is not used
    _ = send_put_request(url=presign_response.put_url, data=input_bytes, headers=put_headers)
    # NOTE(review): send_http_request tolerates HTTP 404 and returns {}, so this line is printed in that case too
    print(f"Upload succeeded for {presign_response.job_id}")
    print()

Sample Output

    Uploading file: c:\path\to\files\test.docx
    Sending request to: PUT https://data-curation-api-drop-prod.s3.amazonaws.com/API_8d5b8381-567d-4471-b0ee-619b5b7601c7?Expires=1755629787
    Upload succeeded for API_8d5b8381-567d-4471-b0ee-619b5b7601c7

Get Status

from dataclasses import dataclass

# Keep the current token unless it is empty, cannot be decoded, or expires within 30 seconds
if not is_access_token_valid(access_token):
    access_token = refresh_access_token()

@dataclass
class StatusResponse:
    """Values read from one status response."""
    job_id: str  # "jobId" of the status response
    status: str  # "status" of the status response (the sample output shows "Done")

# Query the processing status of every submitted job.
for presign_response in presign_responses:
    status_headers = {
        "Authorization": f"Bearer {access_token}"
    }
    status_url = f"{dc_api_url}/status/{presign_response.job_id}"
    status_response_json = send_get_request(url=status_url, headers=status_headers)
    # send_http_request returns an empty dict for HTTP 404 or an empty body,
    # so the keys are checked before they are read.
    if "jobId" not in status_response_json or "status" not in status_response_json:
        print(f"Status for {presign_response.job_id} is not available")
        print()
        continue
    status_response = StatusResponse(
        job_id=status_response_json["jobId"],
        status=status_response_json["status"]
    )
    print(f"Status for {presign_response.job_id} = {status_response.status}")
    print()

Sample Output

    Access Token Expires In: 0:14:57.864776
    
    Sending request to: GET https://knowledge-enrichment.ai.experience.hyland.com/latest/api/data-curation/status/API_8d5b8381-567d-4471-b0ee-619b5b7601c7
    Status for API_8d5b8381-567d-4471-b0ee-619b5b7601c7 = Done

Download Results

# Download the result JSON of every job from the get_url of its presign response.
output_results_json = []
for presign_response in presign_responses:
    print(f"Downloading results for: {presign_response.job_id}")
    # No Authorization header is sent for this request
    get_response_json = send_get_request(url=presign_response.get_url)
    # Empty results (HTTP 404 or an empty body) are not collected, so
    # output_results_json can be shorter than presign_responses
    if get_response_json:
        output_results_json.append(get_response_json)

    print(f"Results for {presign_response.job_id}:")
    print(json_dumps(get_response_json))
    print()

Sample Output

    Downloading results for: API_8d5b8381-567d-4471-b0ee-619b5b7601c7
    Sending request to: GET https://data-curation-api-results-prod.s3.amazonaws.com/API_8d5b8381-567d-4471-b0ee-619b5b7601c7?Expires=1755629787
    Results for API_8d5b8381-567d-4471-b0ee-619b5b7601c7:

    {
      "markdown": {
        "output": "<!-- LOC: 1, (96,120,720,151) -->\n# Test Document\n\n<!-- LOC: 1, (96,164,720,189) -->\n## Header 2\n\n<!-- LOC: 1, (96,196,720,214) -->\nTest paragraph.\n\n### Header 3\n\n<!-- LOC: 1, (96,254,720,272) -->\nAnother paragraph.\n\n#### Header 4\n\n<!-- LOC: 1, (96,305,720,323) -->\nEven more paragraph.\n\n<!-- LOC: 1, (96,334,720,360) -->\n## Bullets\n\n<!-- LOC: 1, (120,366,720,429) -->\n* Point 1\n* Point 2\n* Point 3\n  <!-- LOC: 1, (168,430,720,466) -->\n  * Sub-point 3.1\n  * Sub-point 3.2\n\n<!-- LOC: 1, (96,478,720,503) -->\n## Table\n\n<!-- LOC: 1, (96,510,719,581) -->\n| <!-- LOC: 1, (103,510,297,528) -->**Name** | <!-- LOC: 1, (310,510,504,528) -->**Value** | <!-- LOC: 1, (518,510,712,528) -->**Notes**       |\n| ------------------------------------------ | ------------------------------------------- | ------------------------------------------------- |\n| <!-- LOC: 1, (103,528,297,546) -->Text     | <!-- LOC: 1, (310,528,504,546) -->ABC       | <!-- LOC: 1, (518,528,712,546) -->Text Value      |\n| <!-- LOC: 1, (103,545,297,563) -->Number   | <!-- LOC: 1, (310,545,504,563) -->123       | <!-- LOC: 1, (518,545,712,563) -->Numerical Value |\n| <!-- LOC: 1, (103,563,297,581) -->Symbol   | <!-- LOC: 1, (310,563,504,581) -->\ud83e\udd23\ud83e\udd23\ud83e\udd23    | <!-- LOC: 1, (518,563,712,581) -->Emoji symbols   |\n\n",
        "locations": [
          "<!-- LOC: 1, (96,164,720,189) -->",
          "<!-- LOC: 1, (96,196,720,214) -->",
          "<!-- LOC: 1, (96,254,720,272) -->",
          "<!-- LOC: 1, (96,305,720,323) -->",
          "<!-- LOC: 1, (120,366,720,429) -->",
          "<!-- LOC: 1, (96,510,719,581) -->"
        ],
        "chunks": [
          "# Test Document\n\n",
          "## Header 2\n\n Test paragraph.",
          "### Header 3\n\n Another paragraph.",
          "#### Header 4\n\n Even more paragraph.   \n\n",
          "## Bullets\n\n * Point 1 * Point 2 * Point 3      * Sub-point 3.1   * Sub-point 3.2 \n\n",
          "## Table\n\n \n| **Name** | **Value** | **Notes**       |\n| ------------------------------------------ | ------------------------------------------- | ------------------------------------------------- |\n| Text     | ABC       | Text Value      |\n| Number   | 123       | Numerical Value |\n| Symbol   | \ud83e\udd23\ud83e\udd23\ud83e\udd23    | Emoji symbols   |"
        ]
      }
    }

Parse Results

from dataclasses import dataclass

@dataclass
class ApiResponseChunk:
    """One text chunk of a result together with the location entry at the same index."""
    text: str  # Chunk text
    location: str  # Location marker with "<" and ">" replaced by HTML entities
    embeddings: list[float] | None  # None when the chunk comes from the plain "chunks" list

@dataclass
class ApiResponseMarkdown:
    """The "markdown" part of a result."""
    output: str  # Value of the "output" key
    chunks: list[ApiResponseChunk]  # Parsed chunks; empty when the result contains none

@dataclass
class ApiResponse:
    """Parsed form of one downloaded result."""
    markdown: ApiResponseMarkdown  # Parsed "markdown" part
    json: dict | None  # Value of the "json" key, None when the result has none

# Convert every downloaded result into ApiResponse objects and display them.
output_results = []
for output_result_json in output_results_json:
    markdown_json = output_result_json["markdown"]
    chunks_json = markdown_json.get("chunks", None)
    chunk_with_embeddings_json = markdown_json.get("chunks_with_embeddings", None)
    # "locations" is read as leniently as the chunk keys; a missing key means no locations.
    locations_json = markdown_json.get("locations") or []
    # Angle brackets are replaced by entities before display (presumably so the
    # comment-style markers stay visible as text; kept unchanged).
    locations_json = [location.replace("<", "&lt;").replace(">", "&gt;") for location in locations_json]

    chunks = []
    if chunk_with_embeddings_json:
        for i, chunk_json in enumerate(chunk_with_embeddings_json):
            chunks.append(ApiResponseChunk(
                text=chunk_json["chunk"],
                # Chunks without a location at the same index get None instead of raising IndexError
                location=locations_json[i] if i < len(locations_json) else None,
                embeddings=chunk_json["embeddings"]
            ))
    elif chunks_json:
        for i, chunk_text in enumerate(chunks_json):
            chunks.append(ApiResponseChunk(
                text=chunk_text,
                location=locations_json[i] if i < len(locations_json) else None,
                embeddings=None
            ))

    api_response = ApiResponse(
        markdown=ApiResponseMarkdown(
            output=markdown_json["output"],
            chunks=chunks
        ),
        json=output_result_json.get("json", None)
    )
    output_results.append(api_response)
display_html(output_results)

index value

ApiResponse

field value

markdown

ApiResponseMarkdown

field value

output 
# Test Document


## Header 2


Test paragraph.

### Header 3


Another paragraph.

#### Header 4


Even more paragraph.


## Bullets


* Point 1
* Point 2
* Point 3

* Sub-point 3.1
* Sub-point 3.2


## Table


| Name | Value | Notes |
| ------------------------------------------ | ------------------------------------------- | ------------------------------------------------- |
| Text | ABC | Text Value |
| Number | 123 | Numerical Value |
| Symbol | 🤣🤣🤣 | Emoji symbols |

chunks

list

index value

ApiResponseChunk

field	value
text	# Test Document
location	<!-- LOC: 1, (96,164,720,189) -->
embeddings	None

ApiResponseChunk

field	value
text	## Header 2 Test paragraph.
location	<!-- LOC: 1, (96,196,720,214) -->
embeddings	None

ApiResponseChunk

field	value
text	### Header 3 Another paragraph.
location	<!-- LOC: 1, (96,254,720,272) -->
embeddings	None

ApiResponseChunk

field	value
text	#### Header 4 Even more paragraph.
location	<!-- LOC: 1, (96,305,720,323) -->
embeddings	None

ApiResponseChunk

field	value
text	## Bullets * Point 1 * Point 2 * Point 3 * Sub-point 3.1 * Sub-point 3.2
location	<!-- LOC: 1, (120,366,720,429) -->
embeddings	None

ApiResponseChunk

field	value
text	## Table \| Name \| Value \| Notes \| \| ------------------------------------------ \| ------------------------------------------- \| ------------------------------------------------- \| \| Text \| ABC \| Text Value \| \| Number \| 123 \| Numerical Value \| \| Symbol \| 🤣🤣🤣 \| Emoji symbols \|
location	<!-- LOC: 1, (96,510,719,581) -->
embeddings	None

json None

Display Markdown

from IPython.display import display, Markdown

# Show the Markdown output of every parsed result below the name of its input file.
# NOTE: results are matched to files by position. Jobs whose download returned
# nothing are not collected, so the names can be off after such a job.
for uploaded_file, parsed_result in zip(presign_responses, output_results):
    print(uploaded_file.file_name)
    display(Markdown(parsed_result.markdown.output))

c:\path\to\files\test.docx

Test Document

Header 2

Test paragraph.

Header 3

Another paragraph.

Header 4

Even more paragraph.

Bullets

Point 1
Point 2
Point 3
- Sub-point 3.1
- Sub-point 3.2

Table

Name	Value	Notes
Text	ABC	Text Value
Number	123	Numerical Value
Symbol	🤣🤣🤣	Emoji symbols

Display JSON

from IPython.display import display

# Show the "json" part of every parsed result below the name of its input file.
# NOTE: results are matched to files by position. Jobs whose download returned
# nothing are not collected, so the names can be off after such a job.
for uploaded_file, parsed_result in zip(presign_responses, output_results):
    print(uploaded_file.file_name)
    display(parsed_result.json)

Options​

Credentials​

Helper Functions​

List Files to Upload​

Get Access Token​

Get Presign URL​

Upload Files​

Get Status​

Download Results​

Parse Results​

Display Markdown​

Test Document

Header 2​

Header 3​

Header 4​

Bullets​

Table​

Display JSON​

Options

Credentials

Helper Functions

List Files to Upload

Get Access Token

Get Presign URL

Upload Files

Get Status

Download Results

Parse Results

Display Markdown

Header 2

Header 3

Header 4

Bullets

Table

Display JSON