Skip to main content

Python Sample Notebook

Download the Python Sample Notebook

Options

# Base URL of the Data Curation API; endpoint paths such as /presign are appended to it.
dc_api_url = "https://knowledge-enrichment.ai.experience.hyland.com/latest/api/data-curation"

# Processing options sent as the JSON body of the presign request.
dc_options = {
    "normalization": {
        "quotations": True,  # Normalize quotation marks
        "dashes": True,  # Normalize dashes
    },
    "chunking": True,  # Enables chunking of the text
    "chunk_size": 1000,  # Desired size of each text chunk (may be longer to prevent breaking sentences/paragraphs)
    "embedding": False,  # Generates text embeddings for the chunks (chunking is required)
    "json_schema": False,  # "FULL", "MDAST", "PIPELINE", or False
    "pii": False,  # "redaction", "detection", or False
}

Credentials

# OAuth client credentials need to be configured in the admin console.
# https://admin.experience.hyland.com/external-systems/external-applications
# Ensure the external application has the `environment_authorization` scope and is configured for an environment
# that has the `cin-data-curation-api` application (environment has a Data Curation API subscription).
# The token endpoint used below is "{oauth_url}/connect/token".
oauth_url = "https://auth.iam.experience.hyland.com/idp"
oauth_client_id = "<<REDACTED>>"
oauth_client_secret = "<<REDACTED>>"

Helper Functions

import base64
import datetime
from IPython.display import display, HTML, Markdown
import json
import os
from urllib.error import HTTPError
from urllib.parse import urlencode
from urllib.request import Request, urlopen

def send_http_request(method: str, url: str, headers: dict = {}, data: bytes | None = None) -> dict:
"""Sends an HTTP request and returns the response body as a dictionary."""
request = Request(url=url, data=data, headers=headers, method=method)
print(f"Sending request to: {request.method} {request.full_url}")
response_body = {}
try:
response = urlopen(request)
response_bytes = response.read()
if len(response_bytes) > 0:
response_body = json.loads(response_bytes.decode())
except HTTPError as e:
display(f"HTTP Error: {e.code} - {e.read().decode()}")
if e.code != 404:
raise
finally:
if "response" in locals():
response.close()

return response_body

def send_get_request(url: str, headers: dict | None = None) -> dict:
    """Send a GET request to *url* and return the decoded JSON response.

    Args:
        url: Absolute URL to call.
        headers: Optional request headers; defaults to none.

    Returns:
        Whatever send_http_request returns for the request.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    request_headers = {} if headers is None else headers
    return send_http_request(method="GET", url=url, headers=request_headers)

def send_post_request(url: str, headers: dict | None = None, data: bytes | None = None) -> dict:
    """Send a POST request to *url* and return the decoded JSON response.

    Args:
        url: Absolute URL to call.
        headers: Optional request headers; defaults to none.
        data: Optional raw request body.

    Returns:
        Whatever send_http_request returns for the request.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    request_headers = {} if headers is None else headers
    return send_http_request(method="POST", url=url, headers=request_headers, data=data)

def send_json_post_request(url: str, headers: dict | None = None, data: dict | None = None) -> dict:
    """Send a POST request whose body is *data* serialized as JSON.

    Args:
        url: Absolute URL to call.
        headers: Optional request headers. A "Content-Type: application/json"
            header is added (overriding any existing Content-Type) on a copy,
            so the caller's mapping is left untouched.
        data: Object to serialize with json.dumps; None is sent as ``null``.

    Returns:
        Whatever send_post_request returns for the request.
    """
    # Copy instead of mutating: the original wrote into the caller's dict and
    # into the shared default argument.
    request_headers = {**(headers or {}), "Content-Type": "application/json"}
    json_data = json.dumps(data).encode()
    return send_post_request(url=url, data=json_data, headers=request_headers)

def send_form_post_request(url: str, headers: dict | None = None, data: dict | None = None) -> dict:
    """Send a POST request whose body is *data* encoded as an HTML form.

    Args:
        url: Absolute URL to call.
        headers: Optional request headers. A
            "Content-Type: application/x-www-form-urlencoded" header is added
            (overriding any existing Content-Type) on a copy, so the caller's
            mapping is left untouched.
        data: Form fields; sequence values are expanded into repeated fields
            (``doseq=True``). None is sent as an empty form.

    Returns:
        Whatever send_post_request returns for the request.
    """
    # Copy instead of mutating: the original wrote into the caller's dict and
    # into the shared default argument.
    request_headers = {**(headers or {}), "Content-Type": "application/x-www-form-urlencoded"}
    # urlencode(None) raises TypeError, so fall back to an empty form.
    form_data = urlencode(data or {}, doseq=True).encode()
    return send_post_request(url=url, data=form_data, headers=request_headers)

def send_put_request(url: str, headers: dict | None = None, data: bytes | None = None) -> dict:
    """Send a PUT request to *url* and return the decoded JSON response.

    Args:
        url: Absolute URL to call.
        headers: Optional request headers; defaults to none.
        data: Optional raw request body.

    Returns:
        Whatever send_http_request returns for the request.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    request_headers = {} if headers is None else headers
    return send_http_request(method="PUT", url=url, headers=request_headers, data=data)

def refresh_access_token() -> str | None:
    """Request a new access token using the OAuth client-credentials grant.

    Reads the module-level ``oauth_url``, ``oauth_client_id`` and
    ``oauth_client_secret`` and posts a form request to
    ``{oauth_url}/connect/token`` with HTTP Basic authentication.

    Returns:
        The "access_token" value of the token response, or None when the
        response does not contain one.
    """
    print("Getting a new access token")
    # Built separately: nesting an f-string with the same quote character is
    # only valid syntax from Python 3.12 onwards.
    basic_credentials = base64_encode(f"{oauth_client_id}:{oauth_client_secret}")
    headers = {
        "Accept": "application/json",
        "Authorization": f"Basic {basic_credentials}",
    }
    body = {
        "grant_type": "client_credentials",
        "scope": "environment_authorization",
    }
    token_url = f"{oauth_url}/connect/token"
    token_response = send_form_post_request(url=token_url, data=body, headers=headers)
    access_token = token_response.get("access_token")
    # Called only for the diagnostics it prints; the result is not used here.
    is_access_token_valid(access_token)

    return access_token

def decode_access_token(token: str) -> dict | None:
"""Decodes the access token to extract its JSON payload."""
return json_loads(base64_decode(token.split(".")[1]))

def is_access_token_valid(token: str) -> bool:
    """Report whether *token* is present and has more than 30 seconds left.

    The remaining lifetime is computed from the token's "exp" claim and
    printed. Any error raised while decoding or inspecting the token is
    printed and treated as "not valid".

    Args:
        token: Access token to inspect; an empty or missing token is invalid.

    Returns:
        True only when the token expires more than 30 seconds from now.
    """
    if not token:
        print("Access token is empty")
        print()
        return False

    minimum_lifetime = datetime.timedelta(seconds=30)
    try:
        claims = decode_access_token(token)
        expires_at = datetime.datetime.fromtimestamp(claims["exp"])
        remaining = expires_at - datetime.datetime.now()
        print(f"Access Token Expires In: {remaining}")
        print()
        is_valid = remaining > minimum_lifetime
    except Exception as error:
        print(f"Error validating access token: {error}")
        print()
        is_valid = False
    return is_valid

def base64_encode(data: str) -> str:
    """Return the standard base64 text for *data* (encoded with str.encode defaults)."""
    raw_bytes = data.encode()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode()

def base64_decode(data: str) -> str:
    """Return the text obtained by decoding the standard base64 string *data*."""
    encoded_bytes = data.encode()
    raw_bytes = base64.b64decode(encoded_bytes)
    return raw_bytes.decode()

def json_dumps(obj) -> str:
    """Serialize *obj* to a 2-space indented JSON string.

    Values the json module cannot serialize natively are converted with str().
    """
    serialized = json.dumps(obj, default=str, indent=2)
    return serialized

def json_loads(json_data: str):
    """Parse the JSON document *json_data* and return the resulting Python object."""
    parsed = json.loads(json_data)
    return parsed

def list_files(directory: str) -> list[str]:
    """List the entries of *directory* and return their absolute paths.

    The directory is resolved to an absolute path, its entry names are shown
    as a Markdown bullet list, and every entry returned by os.listdir is
    included in the result.

    Args:
        directory: Directory to list, absolute or relative to the current
            working directory.

    Returns:
        Absolute paths of the entries, in os.listdir order.
    """
    directory_abs = os.path.abspath(directory)
    file_list = os.listdir(directory_abs)
    # Kept outside the f-string: a backslash inside an f-string expression is
    # only valid syntax from Python 3.12 onwards.
    bullet_separator = "\r\n- "
    bullet_list = bullet_separator.join(file_list)
    display(Markdown(f"Listing files in path: {directory_abs}{bullet_separator}{bullet_list}"))
    return [os.path.abspath(os.path.join(directory_abs, file_name)) for file_name in file_list]

def obj_to_html(obj, top_level: bool = True) -> str:
    """Render *obj* as an HTML fragment.

    Objects with a ``__dict__``, lists and dicts become two-column tables
    (field/index/key and value) whose values are rendered recursively.
    Anything else becomes its str() with "<" and ">" escaped and newlines
    turned into ``<br/>``.

    Args:
        obj: Value to render.
        top_level: True for the outermost call. Nested values that are not
            non-empty plain text are wrapped in a collapsible ``<details>``
            element labelled with their class name.

    Returns:
        The HTML fragment.
    """

    def render_table(label: str, pairs) -> str:
        # One row per (name, value) pair; values are rendered as nested fragments.
        cells = "".join(
            f"<tr><td>{name}</td><td>{obj_to_html(value, False)}</td></tr>" for name, value in pairs
        )
        return f"<table><thead><tr><th><i>{label}</i></th><th>value</th></tr></thead><tbody>{cells}</tbody></table>"

    if hasattr(obj, "__dict__"):
        body = render_table("field", vars(obj).items())
    elif isinstance(obj, list):
        body = render_table("index", enumerate(obj))
    elif isinstance(obj, dict):
        body = render_table("key", obj.items())
    else:
        body = str(obj).replace("<", "&lt;").replace(">", "&gt;").replace("\n", "<br/>")

    is_plain_text = isinstance(obj, str) or obj is None
    if top_level or (body and is_plain_text):
        return body

    open_attribute = 'open="open"' if top_level else ''
    class_name = str(obj.__class__.__name__)
    return f"""
<details {open_attribute} class="dni-treeview">
<summary>
<span class="dni-code-hint">
<code>{class_name}</code>
</span>
</summary>
<div>{body}</div>
</details>
"""

def display_html(obj):
    """Render *obj* with obj_to_html and display it, followed by the tree-view stylesheet."""
    markup = obj_to_html(obj)
    # Stylesheet for the nested tables and collapsible tree-view elements.
    style = """
<style>
.dni-code-hint {
font-style: italic;
overflow: hidden;
white-space: nowrap;
}
.dni-treeview {
white-space: nowrap;
}
.dni-treeview td {
vertical-align: top;
text-align: start;
}
details.dni-treeview {
padding-left: 1em;
}
table td {
text-align: start;
}
table tr {
vertical-align: top;
margin: 0em 0px;
}
table tr td pre
{
vertical-align: top !important;
margin: 0em 0px !important;
}
table th {
text-align: start;
}
</style>
"""
    document = markup + style
    display(HTML(document))

List Files to Upload

# Expects there to be an "input" directory with files to process.
# list_files resolves the directory to an absolute path and returns the
# absolute path of every entry in it.
input_file_names = list_files("input")
Sample Output
Listing files in path: c:\path\to\files

- test.docx

Get Access Token

# Request a token, then print its decoded payload when one was returned.
access_token = refresh_access_token()
if access_token:
    token_json = decode_access_token(access_token)
    token_json_text = json_dumps(token_json)
    print(f"Access Token: {token_json_text}")
Sample Output
    Getting a new access token
Sending request to: POST https://auth.iam.experience.hyland.com/idp/connect/token
Access Token Expires In: 0:14:58.983019

Access Token:
    {
"iss": "https://auth.iam.experience.hyland.com/idp",
"nbf": 1755626186,
"iat": 1755626186,
"exp": 1755627086,
"aud": "hxp.authorization",
"scope": [
"environment_authorization"
],
"amr": [
"urn:hyland:params:oauth:grant-type:api-credentials"
],
"client_id": "<<REDACTED>>",
"sub": "<<REDACTED>>",
"auth_time": 1755626186,
"idp": "local",
"hxp_account": "<<REDACTED>>",
"hxp_authorization": {
"account_id": "<<REDACTED>>",
"environment_id": "<<REDACTED>>",
"subscription_id": "<<REDACTED>>",
"appkey": "cin-data-curation-api",
"role": [
"cin-data-curation.user"
],
"permission": [
"cin-data-curation.api.submit"
]
}
}

Get Presign URL

from dataclasses import dataclass

# Get a fresh token when the current one is missing or about to expire.
if not is_access_token_valid(access_token):
    access_token = refresh_access_token()


@dataclass
class PresignResponse:
    """Presign result for one input file."""

    # The first three fields are read with dict.get() and are therefore None
    # when the presign response lacks them.
    job_id: str | None
    put_url: str | None
    get_url: str | None
    file_name: str  # Path of the local input file this presign result belongs to


# The endpoint and headers are the same for every file, so build them once.
presign_url = f"{dc_api_url}/presign"
presign_headers = {
    "Authorization": f"Bearer {access_token}"
}

presign_responses = []
for input_file_name in input_file_names:
    presign_response_json = send_json_post_request(url=presign_url, headers=presign_headers, data=dc_options)
    print(f"Presign Response for {input_file_name}:")
    print(json_dumps(presign_response_json))
    print()
    presign_response = PresignResponse(
        job_id=presign_response_json.get("job_id"),
        put_url=presign_response_json.get("put_url"),
        get_url=presign_response_json.get("get_url"),
        file_name=input_file_name,
    )
    # Keep only complete results; an incomplete one cannot be uploaded or downloaded.
    required_values = (presign_response.job_id, presign_response.put_url, presign_response.get_url)
    if all(value is not None for value in required_values):
        presign_responses.append(presign_response)

Sample Output
    Access Token Expires In: 0:14:58.976669

Sending request to: POST https://knowledge-enrichment.ai.experience.hyland.com/latest/api/data-curation/presign
Presign Response for c:\path\to\files\test.docx:
    {
"job_id": "API_8d5b8381-567d-4471-b0ee-619b5b7601c7",
"put_url": "https://data-curation-api-drop-prod.s3.amazonaws.com/API_8d5b8381-567d-4471-b0ee-619b5b7601c7?Expires=1755629787",
"get_url": "https://data-curation-api-results-prod.s3.amazonaws.com/API_8d5b8381-567d-4471-b0ee-619b5b7601c7?Expires=1755629787"
}

Upload Files

# Upload every input file to the presigned PUT URL obtained for it.
for presign_response in presign_responses:
    # Skip entries that lack either the upload URL or the local file path.
    if not (presign_response.put_url and presign_response.file_name):
        continue

    print(f"Uploading file: {presign_response.file_name}")
    with open(presign_response.file_name, mode="rb") as input_file:
        input_bytes = input_file.read()
    length = len(input_bytes)
    put_headers = {
        'Content-Type': 'application/octet-stream',
        'Content-Length': length,
    }
    _ = send_put_request(
        url=presign_response.put_url,
        data=input_bytes,
        headers=put_headers,
    )
    print(f"Upload succeeded for {presign_response.job_id}")
    print()
Sample Output
    Uploading file: c:\path\to\files\test.docx
Sending request to: PUT https://data-curation-api-drop-prod.s3.amazonaws.com/API_8d5b8381-567d-4471-b0ee-619b5b7601c7?Expires=1755629787
Upload succeeded for API_8d5b8381-567d-4471-b0ee-619b5b7601c7

Get Status

from dataclasses import dataclass

# Get a fresh token when the current one is missing or about to expire.
if not is_access_token_valid(access_token):
    access_token = refresh_access_token()


@dataclass
class StatusResponse:
    """Status of one job as reported by the status endpoint."""

    job_id: str  # Value of the "jobId" key of the status response
    status: str  # Value of the "status" key of the status response


for presign_response in presign_responses:
    status_headers = {
        "Authorization": f"Bearer {access_token}"
    }
    status_url = f"{dc_api_url}/status/{presign_response.job_id}"
    status_response_json = send_get_request(url=status_url, headers=status_headers)
    if not status_response_json:
        # send_http_request returns an empty dict for a 404 or an empty body;
        # indexing it below would raise KeyError.
        print(f"Status for {presign_response.job_id} is not available")
        print()
        continue
    status_response = StatusResponse(
        job_id=status_response_json["jobId"],
        status=status_response_json["status"]
    )
    print(f"Status for {presign_response.job_id} = {status_response.status}")
    print()
Sample Output
    Access Token Expires In: 0:14:57.864776

Sending request to: GET https://knowledge-enrichment.ai.experience.hyland.com/latest/api/data-curation/status/API_8d5b8381-567d-4471-b0ee-619b5b7601c7
Status for API_8d5b8381-567d-4471-b0ee-619b5b7601c7 = Done

Download Results

# Download the result document of every job from its presigned GET URL.
output_results_json = []
for presign_response in presign_responses:
    print(f"Downloading results for: {presign_response.job_id}")
    # Only non-empty results are collected; an empty one is still printed below.
    if get_response_json := send_get_request(url=presign_response.get_url):
        output_results_json.append(get_response_json)

    print(f"Results for {presign_response.job_id}:")
    results_text = json_dumps(get_response_json)
    print(results_text)
    print()
Sample Output
    Downloading results for: API_8d5b8381-567d-4471-b0ee-619b5b7601c7
Sending request to: GET https://data-curation-api-results-prod.s3.amazonaws.com/API_8d5b8381-567d-4471-b0ee-619b5b7601c7?Expires=1755629787
Results for API_8d5b8381-567d-4471-b0ee-619b5b7601c7:
    {
"markdown": {
"output": "<!-- LOC: 1, (96,120,720,151) -->\n# Test Document\n\n<!-- LOC: 1, (96,164,720,189) -->\n## Header 2\n\n<!-- LOC: 1, (96,196,720,214) -->\nTest paragraph.\n\n### Header 3\n\n<!-- LOC: 1, (96,254,720,272) -->\nAnother paragraph.\n\n#### Header 4\n\n<!-- LOC: 1, (96,305,720,323) -->\nEven more paragraph.\n\n<!-- LOC: 1, (96,334,720,360) -->\n## Bullets\n\n<!-- LOC: 1, (120,366,720,429) -->\n* Point 1\n* Point 2\n* Point 3\n <!-- LOC: 1, (168,430,720,466) -->\n * Sub-point 3.1\n * Sub-point 3.2\n\n<!-- LOC: 1, (96,478,720,503) -->\n## Table\n\n<!-- LOC: 1, (96,510,719,581) -->\n| <!-- LOC: 1, (103,510,297,528) -->**Name** | <!-- LOC: 1, (310,510,504,528) -->**Value** | <!-- LOC: 1, (518,510,712,528) -->**Notes** |\n| ------------------------------------------ | ------------------------------------------- | ------------------------------------------------- |\n| <!-- LOC: 1, (103,528,297,546) -->Text | <!-- LOC: 1, (310,528,504,546) -->ABC | <!-- LOC: 1, (518,528,712,546) -->Text Value |\n| <!-- LOC: 1, (103,545,297,563) -->Number | <!-- LOC: 1, (310,545,504,563) -->123 | <!-- LOC: 1, (518,545,712,563) -->Numerical Value |\n| <!-- LOC: 1, (103,563,297,581) -->Symbol | <!-- LOC: 1, (310,563,504,581) -->\ud83e\udd23\ud83e\udd23\ud83e\udd23 | <!-- LOC: 1, (518,563,712,581) -->Emoji symbols |\n\n",
"locations": [
"<!-- LOC: 1, (96,164,720,189) -->",
"<!-- LOC: 1, (96,196,720,214) -->",
"<!-- LOC: 1, (96,254,720,272) -->",
"<!-- LOC: 1, (96,305,720,323) -->",
"<!-- LOC: 1, (120,366,720,429) -->",
"<!-- LOC: 1, (96,510,719,581) -->"
],
"chunks": [
"# Test Document\n\n",
"## Header 2\n\n Test paragraph.",
"### Header 3\n\n Another paragraph.",
"#### Header 4\n\n Even more paragraph. \n\n",
"## Bullets\n\n * Point 1 * Point 2 * Point 3 * Sub-point 3.1 * Sub-point 3.2 \n\n",
"## Table\n\n \n| **Name** | **Value** | **Notes** |\n| ------------------------------------------ | ------------------------------------------- | ------------------------------------------------- |\n| Text | ABC | Text Value |\n| Number | 123 | Numerical Value |\n| Symbol | \ud83e\udd23\ud83e\udd23\ud83e\udd23 | Emoji symbols |"
]
}
}

Parse Results

from dataclasses import dataclass

@dataclass
class ApiResponseChunk:
text: str
location: str
embeddings: list[float]

@dataclass
class ApiResponseMarkdown:
output: str
chunks: list[ApiResponseChunk]

@dataclass
class ApiResponse:
markdown: ApiResponseMarkdown
json: dict

# Convert every downloaded result into ApiResponse objects, then show them.
output_results = []
for output_result_json in output_results_json:
    chunks_json = output_result_json["markdown"].get("chunks", None)
    chunk_with_embeddings_json = output_result_json["markdown"].get("chunks_with_embeddings", None)
    # Escape the angle brackets of each location marker.
    locations_json = [
        location.replace("<", "&lt;").replace(">", "&gt;")
        for location in output_result_json["markdown"]["locations"]
    ]

    # Chunks that carry embeddings take precedence over plain chunks; each
    # chunk is paired with the location at the same index.
    chunks = []
    if chunk_with_embeddings_json:
        for i, chunk_entry in enumerate(chunk_with_embeddings_json):
            embeddings_json = chunk_entry["embeddings"]
            chunk = ApiResponseChunk(
                text=chunk_entry["chunk"],
                location=locations_json[i],
                embeddings=embeddings_json,
            )
            chunks.append(chunk)
    elif chunks_json:
        for i, chunk_text in enumerate(chunks_json):
            chunk = ApiResponseChunk(
                text=chunk_text,
                location=locations_json[i],
                embeddings=None,
            )
            chunks.append(chunk)

    markdown_part = ApiResponseMarkdown(
        output=output_result_json["markdown"]["output"],
        chunks=chunks,
    )
    api_response = ApiResponse(
        markdown=markdown_part,
        json=output_result_json.get("json", None),
    )
    output_results.append(api_response)
display_html(output_results)
indexvalue
0
ApiResponse
fieldvalue
markdown
ApiResponseMarkdown
fieldvalue
output<!-- LOC: 1, (96,120,720,151) -->
# Test Document

<!-- LOC: 1, (96,164,720,189) -->
## Header 2

<!-- LOC: 1, (96,196,720,214) -->
Test paragraph.

### Header 3

<!-- LOC: 1, (96,254,720,272) -->
Another paragraph.

#### Header 4

<!-- LOC: 1, (96,305,720,323) -->
Even more paragraph.

<!-- LOC: 1, (96,334,720,360) -->
## Bullets

<!-- LOC: 1, (120,366,720,429) -->
* Point 1
* Point 2
* Point 3
<!-- LOC: 1, (168,430,720,466) -->
* Sub-point 3.1
* Sub-point 3.2

<!-- LOC: 1, (96,478,720,503) -->
## Table

<!-- LOC: 1, (96,510,719,581) -->
| <!-- LOC: 1, (103,510,297,528) -->Name | <!-- LOC: 1, (310,510,504,528) -->Value | <!-- LOC: 1, (518,510,712,528) -->Notes |
| ------------------------------------------ | ------------------------------------------- | ------------------------------------------------- |
| <!-- LOC: 1, (103,528,297,546) -->Text | <!-- LOC: 1, (310,528,504,546) -->ABC | <!-- LOC: 1, (518,528,712,546) -->Text Value |
| <!-- LOC: 1, (103,545,297,563) -->Number | <!-- LOC: 1, (310,545,504,563) -->123 | <!-- LOC: 1, (518,545,712,563) -->Numerical Value |
| <!-- LOC: 1, (103,563,297,581) -->Symbol | <!-- LOC: 1, (310,563,504,581) -->🤣🤣🤣 | <!-- LOC: 1, (518,563,712,581) -->Emoji symbols |

chunks
list
indexvalue
0
ApiResponseChunk
fieldvalue
text# Test Document

location<!-- LOC: 1, (96,164,720,189) -->
embeddingsNone
1
ApiResponseChunk
fieldvalue
text## Header 2

Test paragraph.
location<!-- LOC: 1, (96,196,720,214) -->
embeddingsNone
2
ApiResponseChunk
fieldvalue
text### Header 3

Another paragraph.
location<!-- LOC: 1, (96,254,720,272) -->
embeddingsNone
3
ApiResponseChunk
fieldvalue
text#### Header 4

Even more paragraph.

location<!-- LOC: 1, (96,305,720,323) -->
embeddingsNone
4
ApiResponseChunk
fieldvalue
text## Bullets

* Point 1 * Point 2 * Point 3 * Sub-point 3.1 * Sub-point 3.2

location<!-- LOC: 1, (120,366,720,429) -->
embeddingsNone
5
ApiResponseChunk
fieldvalue
text## Table


| Name | Value | Notes |
| ------------------------------------------ | ------------------------------------------- | ------------------------------------------------- |
| Text | ABC | Text Value |
| Number | 123 | Numerical Value |
| Symbol | 🤣🤣🤣 | Emoji symbols |
location<!-- LOC: 1, (96,510,719,581) -->
embeddingsNone
jsonNone

Display Markdown

from IPython.display import display, Markdown

# Render the Markdown output of every parsed result under its file name.
# NOTE(review): results are matched to presign_responses by position; this
# assumes every job produced a result, so both lists line up - confirm.
for i, output_result in enumerate(output_results):
    print(presign_responses[i].file_name)
    display(Markdown(output_result.markdown.output))

c:\path\to\files\test.docx

Test Document

Header 2

Test paragraph.

Header 3

Another paragraph.

Header 4

Even more paragraph.

Bullets

  • Point 1
  • Point 2
  • Point 3
    • Sub-point 3.1
    • Sub-point 3.2

Table

| Name | Value | Notes |
| --- | --- | --- |
| Text | ABC | Text Value |
| Number | 123 | Numerical Value |
| Symbol | 🤣🤣🤣 | Emoji symbols |

Display JSON

from IPython.display import display

# Show the JSON part of every parsed result under its file name.
# NOTE(review): results are matched to presign_responses by position; this
# assumes every job produced a result, so both lists line up - confirm.
for i, output_result in enumerate(output_results):
    print(presign_responses[i].file_name)
    display(output_result.json)