Python Sample Notebook
Download the Python Sample Notebook
Options
# Base URL of the Data Curation API; endpoint paths are appended to it.
dc_api_url = "https://knowledge-enrichment.ai.experience.hyland.com/latest/api/data-curation"

# Processing options, sent as the JSON body of every presign request.
dc_options = {
    "normalization": {
        "quotations": True,  # Normalize quotation marks
        "dashes": True,  # Normalize dashes
    },
    "chunking": True,  # Enables chunking of the text
    "chunk_size": 1000,  # Desired size of each text chunk (may be longer to prevent breaking sentences/paragraphs)
    "embedding": False,  # Generates text embeddings for the chunks (chunking is required)
    "json_schema": False,  # "FULL", "MDAST", "PIPELINE", or False
    "pii": False,  # "redaction", "detection", or False
}
Credentials
# OAuth client credentials need to be configured in the admin console.
# https://admin.experience.hyland.com/external-systems/external-applications
# Ensure the external application has the `environment_authorization` scope and is configured for an environment
# that has the `cin-data-curation-api` application (environment has a Data Curation API subscription).
# Base URL of the identity provider; the token endpoint path is appended to it when requesting a token.
oauth_url = "https://auth.iam.experience.hyland.com/idp"
# Client id and secret are combined into the HTTP Basic Authorization header of the token request.
oauth_client_id = "<<REDACTED>>"
oauth_client_secret = "<<REDACTED>>"
Helper Functions
import base64
import datetime
from IPython.display import display, HTML, Markdown
import json
import os
from urllib.error import HTTPError
from urllib.parse import urlencode
from urllib.request import Request, urlopen
def send_http_request(method: str, url: str, headers: dict = {}, data: bytes | None = None) -> dict:
"""Sends an HTTP request and returns the response body as a dictionary."""
request = Request(url=url, data=data, headers=headers, method=method)
print(f"Sending request to: {request.method} {request.full_url}")
response_body = {}
try:
response = urlopen(request)
response_bytes = response.read()
if len(response_bytes) > 0:
response_body = json.loads(response_bytes.decode())
except HTTPError as e:
display(f"HTTP Error: {e.code} - {e.read().decode()}")
if e.code != 404:
raise
finally:
if "response" in locals():
response.close()
return response_body
def send_get_request(url: str, headers: dict | None = None) -> dict:
    """Send a GET request and return the JSON response body.

    Args:
        url: Absolute URL to call.
        headers: Optional request headers; a copy is forwarded.

    Returns:
        Whatever `send_http_request` returns for the call.
    """
    # A fresh dict per call avoids sharing a mutable default between calls.
    return send_http_request(method="GET", url=url, headers=dict(headers or {}))
def send_post_request(url: str, headers: dict | None = None, data: bytes | None = None) -> dict:
    """Send a POST request and return the JSON response body.

    Args:
        url: Absolute URL to call.
        headers: Optional request headers; a copy is forwarded.
        data: Optional raw request body.

    Returns:
        Whatever `send_http_request` returns for the call.
    """
    # A fresh dict per call avoids sharing a mutable default between calls.
    return send_http_request(method="POST", url=url, headers=dict(headers or {}), data=data)
def send_json_post_request(url: str, headers: dict | None = None, data: dict | None = None) -> dict:
    """Send a POST request whose body is `data` serialized as JSON.

    Args:
        url: Absolute URL to call.
        headers: Optional request headers. Not modified; any existing
            "Content-Type" entry is overridden in the copy that is sent.
        data: Object to serialize with `json.dumps` (None is sent as `null`).

    Returns:
        Whatever `send_post_request` returns for the call.
    """
    # Build a new dict instead of writing into the caller's (or a shared default) mapping.
    request_headers = {**(headers or {}), "Content-Type": "application/json"}
    json_data = json.dumps(data).encode()
    return send_post_request(url=url, data=json_data, headers=request_headers)
def send_form_post_request(url: str, headers: dict | None = None, data: dict | None = None) -> dict:
    """Send a POST request whose body is `data` encoded as an HTML form.

    Args:
        url: Absolute URL to call.
        headers: Optional request headers. Not modified; any existing
            "Content-Type" entry is overridden in the copy that is sent.
        data: Form fields; sequence values are expanded into repeated keys.
            When omitted, an empty body is sent.

    Returns:
        Whatever `send_post_request` returns for the call.
    """
    # Build a new dict instead of writing into the caller's (or a shared default) mapping.
    request_headers = {**(headers or {}), "Content-Type": "application/x-www-form-urlencoded"}
    # urlencode() rejects None, so fall back to an empty mapping.
    form_data = urlencode(data or {}, doseq=True).encode()
    return send_post_request(url=url, data=form_data, headers=request_headers)
def send_put_request(url: str, headers: dict | None = None, data: bytes | None = None) -> dict:
    """Send a PUT request and return the JSON response body.

    Args:
        url: Absolute URL to call.
        headers: Optional request headers; a copy is forwarded.
        data: Optional raw request body.

    Returns:
        Whatever `send_http_request` returns for the call.
    """
    # A fresh dict per call avoids sharing a mutable default between calls.
    return send_http_request(method="PUT", url=url, headers=dict(headers or {}), data=data)
def refresh_access_token() -> str | None:
    """Request a new access token with the OAuth client-credentials grant.

    Reads `oauth_url`, `oauth_client_id` and `oauth_client_secret` from module
    scope and posts a form-encoded token request to `{oauth_url}/connect/token`.

    Returns:
        The "access_token" field of the token response, or None when the
        response does not contain one.
    """
    print("Getting a new access token")
    # Built separately: reusing the same quote character inside a nested
    # f-string is a syntax error before Python 3.12.
    basic_credentials = base64_encode(f"{oauth_client_id}:{oauth_client_secret}")
    headers = {
        "Accept": "application/json",
        "Authorization": f"Basic {basic_credentials}",
    }
    body = {
        "grant_type": "client_credentials",
        "scope": "environment_authorization",
    }
    token_url = f"{oauth_url}/connect/token"
    token_response = send_form_post_request(url=token_url, data=body, headers=headers)
    access_token = token_response.get("access_token")
    # Called only for its printed report (time to expiry or error); the result is ignored.
    is_access_token_valid(access_token)
    return access_token
def decode_access_token(token: str) -> dict | None:
"""Decodes the access token to extract its JSON payload."""
return json_loads(base64_decode(token.split(".")[1]))
def is_access_token_valid(token: str) -> bool:
    """Report whether `token` is present and has more than 30 seconds left.

    Prints the remaining lifetime (or the reason validation failed) followed by
    a blank line. Any error while decoding or reading the expiry is reported
    and treated as "not valid".

    Args:
        token: Access token to inspect; an empty or missing token is invalid.

    Returns:
        True only when the "exp" claim lies more than 30 seconds in the future.
    """
    if not token:
        print("Access token is empty")
        print()
        return False

    try:
        claims = decode_access_token(token)
        expires_at = datetime.datetime.fromtimestamp(claims["exp"])
        remaining = expires_at - datetime.datetime.now()
        print(f"Access Token Expires In: {remaining}")
        print()
        # Require a safety margin so the token does not expire mid-request.
        safety_margin = datetime.timedelta(seconds=30)
        return remaining > safety_margin
    except Exception as error:
        print(f"Error validating access token: {error}")
        print()
        return False
def base64_encode(data: str) -> str:
    """Return the standard base64 text representation of `data`."""
    raw_bytes = data.encode()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode()
def base64_decode(data: str) -> str:
    """Return the text obtained by decoding the standard base64 string `data`."""
    encoded_bytes = data.encode()
    raw_bytes = base64.b64decode(encoded_bytes)
    return raw_bytes.decode()
def json_dumps(obj) -> str:
    """Serialize `obj` to a two-space-indented JSON string.

    Values the JSON encoder cannot handle natively are rendered with `str()`.
    """
    dump_options = {"indent": 2, "default": str}
    return json.dumps(obj, **dump_options)
def json_loads(json_data: str):
    """Parse a JSON document and return the corresponding Python object.

    Args:
        json_data: JSON text.

    Returns:
        The decoded value (dict, list, str, number, bool or None).

    Raises:
        json.JSONDecodeError: If `json_data` is not valid JSON.
    """
    return json.loads(json_data)
def list_files(directory: str) -> list[str]:
    """Display and return the regular files found directly inside `directory`.

    Sub-directories are skipped because callers open every returned path for
    reading. Names are sorted so the listing and processing order are
    deterministic.

    Args:
        directory: Directory to scan (relative paths are resolved against the
            current working directory).

    Returns:
        Absolute paths of the files, in sorted name order.

    Raises:
        OSError: If the directory cannot be listed (e.g. it does not exist).
    """
    directory_abs = os.path.abspath(directory)
    file_names = sorted(
        name
        for name in os.listdir(directory_abs)
        if os.path.isfile(os.path.join(directory_abs, name))
    )
    # Built outside the f-string: backslashes inside a replacement field are a
    # syntax error before Python 3.12. An empty directory yields no bullet.
    bullet_lines = "".join(f"\r\n- {name}" for name in file_names)
    display(Markdown(f"Listing files in path: {directory_abs}{bullet_lines}"))
    return [os.path.join(directory_abs, name) for name in file_names]
def obj_to_html(obj, top_level: bool = True) -> str:
    """Render `obj` as an HTML fragment made of nested tables.

    Objects with a `__dict__` become a field/value table, lists an index/value
    table and dicts a key/value table; nested values are rendered recursively.
    Anything else is converted with `str()`, HTML-escaped, and has its newlines
    turned into `<br/>`.

    Args:
        obj: Value to render.
        top_level: True for the outermost call. Nested values other than
            non-empty strings and None are wrapped in a collapsible
            `<details>` element labelled with their class name.

    Returns:
        The HTML fragment.
    """
    from html import escape  # local import keeps this helper self-contained

    def cell_text(value) -> str:
        # Escape &, < and > so arbitrary text cannot inject markup.
        return escape(str(value), quote=False)

    def table(label_heading: str, labelled_values) -> str:
        rows = [
            f"<tr><td>{cell_text(label)}</td><td>{obj_to_html(value, False)}</td></tr>"
            for label, value in labelled_values
        ]
        return (
            f"<table><thead><tr><th><i>{label_heading}</i></th><th>value</th></tr></thead>"
            f"<tbody>{''.join(rows)}</tbody></table>"
        )

    if hasattr(obj, "__dict__"):
        body = table("field", vars(obj).items())
    elif isinstance(obj, list):
        body = table("index", enumerate(obj))
    elif isinstance(obj, dict):
        body = table("key", obj.items())
    else:
        body = cell_text(obj).replace("\n", "<br/>")

    # Top-level output, non-empty strings and None are shown inline.
    if top_level or (body and (isinstance(obj, str) or obj is None)):
        return body

    # Every other nested value gets a collapsible wrapper showing its type.
    return f"""
<details class="dni-treeview">
<summary>
<span class="dni-code-hint">
<code>{cell_text(obj.__class__.__name__)}</code>
</span>
</summary>
<div>{body}</div>
</details>
"""
def display_html(obj):
    """Render `obj` with `obj_to_html` and show it, followed by the tree-view stylesheet."""
    stylesheet = """
<style>
.dni-code-hint {
font-style: italic;
overflow: hidden;
white-space: nowrap;
}
.dni-treeview {
white-space: nowrap;
}
.dni-treeview td {
vertical-align: top;
text-align: start;
}
details.dni-treeview {
padding-left: 1em;
}
table td {
text-align: start;
}
table tr {
vertical-align: top;
margin: 0em 0px;
}
table tr td pre
{
vertical-align: top !important;
margin: 0em 0px !important;
}
table th {
text-align: start;
}
</style>
"""
    # The stylesheet is appended after the rendered markup.
    markup = obj_to_html(obj) + stylesheet
    display(HTML(markup))
List Files to Upload
# Expects there to be an "input" directory with files to process.
# The relative path is resolved against the current working directory by list_files,
# which also displays the listing; the returned paths drive the presign and upload steps.
input_file_names = list_files("input")
Sample Output
Listing files in path: c:\path\to\files
- test.docx
Get Access Token
# Request a fresh token; the value is None when the token response has no "access_token" field.
access_token = refresh_access_token()
if access_token:
    # Decode the token payload and pretty-print it for inspection.
    token_json = decode_access_token(access_token)
    print(f"Access Token: {json_dumps(token_json)}")
Sample Output
Getting a new access token
Sending request to: POST https://auth.iam.experience.hyland.com/idp/connect/token
Access Token Expires In: 0:14:58.983019
Access Token:
{
"iss": "https://auth.iam.experience.hyland.com/idp",
"nbf": 1755626186,
"iat": 1755626186,
"exp": 1755627086,
"aud": "hxp.authorization",
"scope": [
"environment_authorization"
],
"amr": [
"urn:hyland:params:oauth:grant-type:api-credentials"
],
"client_id": "<<REDACTED>>",
"sub": "<<REDACTED>>",
"auth_time": 1755626186,
"idp": "local",
"hxp_account": "<<REDACTED>>",
"hxp_authorization": {
"account_id": "<<REDACTED>>",
"environment_id": "<<REDACTED>>",
"subscription_id": "<<REDACTED>>",
"appkey": "cin-data-curation-api",
"role": [
"cin-data-curation.user"
],
"permission": [
"cin-data-curation.api.submit"
]
}
}
Get Presign URL
from dataclasses import dataclass
# The presign endpoint is called with a bearer token, so renew it when it is missing or about to expire.
if not is_access_token_valid(access_token):
    access_token = refresh_access_token()


@dataclass
class PresignResponse:
    """Presign result for one input file."""

    job_id: str  # "job_id" field of the presign response
    put_url: str  # "put_url" field; the file is uploaded here
    get_url: str  # "get_url" field; the results are downloaded from here
    file_name: str  # path of the local input file this job belongs to


# Neither the endpoint nor the headers depend on the file, so build them once.
presign_url = f"{dc_api_url}/presign"
presign_headers = {
    "Authorization": f"Bearer {access_token}"
}

presign_responses = []
for input_file_name in input_file_names:
    presign_response_json = send_json_post_request(url=presign_url, headers=presign_headers, data=dc_options)
    print(f"Presign Response for {input_file_name}:")
    print(json_dumps(presign_response_json))
    print()
    presign_response = PresignResponse(
        job_id=presign_response_json.get("job_id"),
        put_url=presign_response_json.get("put_url"),
        get_url=presign_response_json.get("get_url"),
        file_name=input_file_name,
    )
    required_values = (presign_response.job_id, presign_response.put_url, presign_response.get_url)
    if any(value is None for value in required_values):
        # Say so explicitly instead of silently dropping the file from later steps.
        print(f"Skipping {input_file_name}: presign response is missing job_id, put_url or get_url")
        print()
        continue
    presign_responses.append(presign_response)
Sample Output
Access Token Expires In: 0:14:58.976669
Sending request to: POST https://knowledge-enrichment.ai.experience.hyland.com/latest/api/data-curation/presign
Presign Response for c:\path\to\files\test.docx:
{
"job_id": "API_8d5b8381-567d-4471-b0ee-619b5b7601c7",
"put_url": "https://data-curation-api-drop-prod.s3.amazonaws.com/API_8d5b8381-567d-4471-b0ee-619b5b7601c7?Expires=1755629787",
"get_url": "https://data-curation-api-results-prod.s3.amazonaws.com/API_8d5b8381-567d-4471-b0ee-619b5b7601c7?Expires=1755629787"
}
Upload Files
# Upload each input file to the presigned PUT URL that was issued for it.
for presign_response in presign_responses:
    # Skip entries that lack an upload URL or a source file name.
    if not presign_response.put_url or not presign_response.file_name:
        continue
    print(f"Uploading file: {presign_response.file_name}")
    # The whole file is read into memory and sent as a single request body.
    with open(presign_response.file_name, mode="rb") as input_file:
        input_bytes = input_file.read()
    length = len(input_bytes)
    put_headers = {
        'Content-Type': 'application/octet-stream',
        'Content-Length': length
    }
    # The response body is not used.
    _ = send_put_request(url=presign_response.put_url, data=input_bytes, headers=put_headers)
    # NOTE(review): send_http_request swallows HTTP 404, so this line is also printed in that case.
    print(f"Upload succeeded for {presign_response.job_id}")
    print()
Sample Output
Uploading file: c:\path\to\files\test.docx
Sending request to: PUT https://data-curation-api-drop-prod.s3.amazonaws.com/API_8d5b8381-567d-4471-b0ee-619b5b7601c7?Expires=1755629787
Upload succeeded for API_8d5b8381-567d-4471-b0ee-619b5b7601c7
Get Status
from dataclasses import dataclass
# The status endpoint is called with a bearer token, so renew it when it is missing or about to expire.
if not is_access_token_valid(access_token):
    access_token = refresh_access_token()


@dataclass
class StatusResponse:
    """Status of one job as reported by the status endpoint."""

    job_id: str  # "jobId" field of the status response
    status: str  # "status" field of the status response


# The headers do not depend on the job, so build them once.
status_headers = {
    "Authorization": f"Bearer {access_token}"
}
for presign_response in presign_responses:
    status_url = f"{dc_api_url}/status/{presign_response.job_id}"
    status_response_json = send_get_request(url=status_url, headers=status_headers)
    if not status_response_json:
        # send_http_request yields {} for HTTP 404 or an empty body; indexing it would raise KeyError.
        print(f"No status available for {presign_response.job_id}")
        print()
        continue
    status_response = StatusResponse(
        job_id=status_response_json["jobId"],
        status=status_response_json["status"]
    )
    print(f"Status for {presign_response.job_id} = {status_response.status}")
    print()
Sample Output
Access Token Expires In: 0:14:57.864776
Sending request to: GET https://knowledge-enrichment.ai.experience.hyland.com/latest/api/data-curation/status/API_8d5b8381-567d-4471-b0ee-619b5b7601c7
Status for API_8d5b8381-567d-4471-b0ee-619b5b7601c7 = Done
Download Results
# Result documents, one entry per job whose download returned a non-empty body.
output_results_json = []
for presign_response in presign_responses:
    print(f"Downloading results for: {presign_response.job_id}")
    # No Authorization header is passed; the request goes to the presigned get_url as-is.
    get_response_json = send_get_request(url=presign_response.get_url)
    # An empty dict (empty body or HTTP 404) is skipped.
    # NOTE(review): skipped jobs leave no placeholder, so positions in this list
    # can differ from positions in presign_responses.
    if get_response_json:
        output_results_json.append(get_response_json)
        print(f"Results for {presign_response.job_id}:")
        print(json_dumps(get_response_json))
        print()
Sample Output
Downloading results for: API_8d5b8381-567d-4471-b0ee-619b5b7601c7
Sending request to: GET https://data-curation-api-results-prod.s3.amazonaws.com/API_8d5b8381-567d-4471-b0ee-619b5b7601c7?Expires=1755629787
Results for API_8d5b8381-567d-4471-b0ee-619b5b7601c7:
{
"markdown": {
"output": "<!-- LOC: 1, (96,120,720,151) -->\n# Test Document\n\n<!-- LOC: 1, (96,164,720,189) -->\n## Header 2\n\n<!-- LOC: 1, (96,196,720,214) -->\nTest paragraph.\n\n### Header 3\n\n<!-- LOC: 1, (96,254,720,272) -->\nAnother paragraph.\n\n#### Header 4\n\n<!-- LOC: 1, (96,305,720,323) -->\nEven more paragraph.\n\n<!-- LOC: 1, (96,334,720,360) -->\n## Bullets\n\n<!-- LOC: 1, (120,366,720,429) -->\n* Point 1\n* Point 2\n* Point 3\n <!-- LOC: 1, (168,430,720,466) -->\n * Sub-point 3.1\n * Sub-point 3.2\n\n<!-- LOC: 1, (96,478,720,503) -->\n## Table\n\n<!-- LOC: 1, (96,510,719,581) -->\n| <!-- LOC: 1, (103,510,297,528) -->**Name** | <!-- LOC: 1, (310,510,504,528) -->**Value** | <!-- LOC: 1, (518,510,712,528) -->**Notes** |\n| ------------------------------------------ | ------------------------------------------- | ------------------------------------------------- |\n| <!-- LOC: 1, (103,528,297,546) -->Text | <!-- LOC: 1, (310,528,504,546) -->ABC | <!-- LOC: 1, (518,528,712,546) -->Text Value |\n| <!-- LOC: 1, (103,545,297,563) -->Number | <!-- LOC: 1, (310,545,504,563) -->123 | <!-- LOC: 1, (518,545,712,563) -->Numerical Value |\n| <!-- LOC: 1, (103,563,297,581) -->Symbol | <!-- LOC: 1, (310,563,504,581) -->\ud83e\udd23\ud83e\udd23\ud83e\udd23 | <!-- LOC: 1, (518,563,712,581) -->Emoji symbols |\n\n",
"locations": [
"<!-- LOC: 1, (96,164,720,189) -->",
"<!-- LOC: 1, (96,196,720,214) -->",
"<!-- LOC: 1, (96,254,720,272) -->",
"<!-- LOC: 1, (96,305,720,323) -->",
"<!-- LOC: 1, (120,366,720,429) -->",
"<!-- LOC: 1, (96,510,719,581) -->"
],
"chunks": [
"# Test Document\n\n",
"## Header 2\n\n Test paragraph.",
"### Header 3\n\n Another paragraph.",
"#### Header 4\n\n Even more paragraph. \n\n",
"## Bullets\n\n * Point 1 * Point 2 * Point 3 * Sub-point 3.1 * Sub-point 3.2 \n\n",
"## Table\n\n \n| **Name** | **Value** | **Notes** |\n| ------------------------------------------ | ------------------------------------------- | ------------------------------------------------- |\n| Text | ABC | Text Value |\n| Number | 123 | Numerical Value |\n| Symbol | \ud83e\udd23\ud83e\udd23\ud83e\udd23 | Emoji symbols |"
]
}
}
Parse Results
from dataclasses import dataclass
@dataclass
class ApiResponseChunk:
    """One text chunk of a processed document."""

    text: str  # chunk text
    location: str | None  # location marker at the same index, or None when the list is shorter
    embeddings: list[float] | None  # None when the result carries plain chunks only


@dataclass
class ApiResponseMarkdown:
    """Markdown part of a result: the full output plus its chunks."""

    output: str  # "output" field of the result's "markdown" object
    chunks: list[ApiResponseChunk]  # empty when the result has no chunks


@dataclass
class ApiResponse:
    """Parsed form of one downloaded result document."""

    markdown: ApiResponseMarkdown
    json: dict | None  # "json" field of the result, None when absent


output_results = []
for output_result_json in output_results_json:
    markdown_json = output_result_json["markdown"]
    chunks_json = markdown_json.get("chunks", None)
    chunk_with_embeddings_json = markdown_json.get("chunks_with_embeddings", None)
    # Locations are kept verbatim (HTML escaping belongs to the display layer);
    # a missing or null list is treated as empty instead of raising KeyError.
    locations_json = markdown_json.get("locations") or []
    location_count = len(locations_json)

    # Chunks are paired with locations by index; a chunk beyond the end of the
    # locations list gets None rather than raising IndexError.
    chunks = []
    if chunk_with_embeddings_json:
        chunks = [
            ApiResponseChunk(
                text=chunk_json["chunk"],
                location=locations_json[i] if i < location_count else None,
                embeddings=chunk_json["embeddings"],
            )
            for i, chunk_json in enumerate(chunk_with_embeddings_json)
        ]
    elif chunks_json:
        chunks = [
            ApiResponseChunk(
                text=chunk_text,
                location=locations_json[i] if i < location_count else None,
                embeddings=None,
            )
            for i, chunk_text in enumerate(chunks_json)
        ]

    api_response = ApiResponse(
        markdown=ApiResponseMarkdown(
            output=markdown_json["output"],
            chunks=chunks,
        ),
        json=output_result_json.get("json", None),
    )
    output_results.append(api_response)

display_html(output_results)
| index | value | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 |
|
| field | value | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| markdown |
|
| field | value | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| output | <!-- LOC: 1, (96,120,720,151) --> # Test Document <!-- LOC: 1, (96,164,720,189) --> ## Header 2 <!-- LOC: 1, (96,196,720,214) --> Test paragraph. ### Header 3 <!-- LOC: 1, (96,254,720,272) --> Another paragraph. #### Header 4 <!-- LOC: 1, (96,305,720,323) --> Even more paragraph. <!-- LOC: 1, (96,334,720,360) --> ## Bullets <!-- LOC: 1, (120,366,720,429) --> * Point 1 * Point 2 * Point 3 <!-- LOC: 1, (168,430,720,466) --> * Sub-point 3.1 * Sub-point 3.2 <!-- LOC: 1, (96,478,720,503) --> ## Table <!-- LOC: 1, (96,510,719,581) --> | <!-- LOC: 1, (103,510,297,528) -->Name | <!-- LOC: 1, (310,510,504,528) -->Value | <!-- LOC: 1, (518,510,712,528) -->Notes | | ------------------------------------------ | ------------------------------------------- | ------------------------------------------------- | | <!-- LOC: 1, (103,528,297,546) -->Text | <!-- LOC: 1, (310,528,504,546) -->ABC | <!-- LOC: 1, (518,528,712,546) -->Text Value | | <!-- LOC: 1, (103,545,297,563) -->Number | <!-- LOC: 1, (310,545,504,563) -->123 | <!-- LOC: 1, (518,545,712,563) -->Numerical Value | | <!-- LOC: 1, (103,563,297,581) -->Symbol | <!-- LOC: 1, (310,563,504,581) -->🤣🤣🤣 | <!-- LOC: 1, (518,563,712,581) -->Emoji symbols | | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| chunks |
|
| index | value | ||||||||
|---|---|---|---|---|---|---|---|---|---|
| 0 |
|
| field | value |
|---|---|
| text | # Test Document |
| location | <!-- LOC: 1, (96,164,720,189) --> |
| embeddings | None |
ApiResponseChunk
| field | value |
|---|---|
| text | ## Header 2 Test paragraph. |
| location | <!-- LOC: 1, (96,196,720,214) --> |
| embeddings | None |
ApiResponseChunk
| field | value |
|---|---|
| text | ### Header 3 Another paragraph. |
| location | <!-- LOC: 1, (96,254,720,272) --> |
| embeddings | None |
ApiResponseChunk
| field | value |
|---|---|
| text | #### Header 4 Even more paragraph. |
| location | <!-- LOC: 1, (96,305,720,323) --> |
| embeddings | None |
ApiResponseChunk
| field | value |
|---|---|
| text | ## Bullets * Point 1 * Point 2 * Point 3 * Sub-point 3.1 * Sub-point 3.2 |
| location | <!-- LOC: 1, (120,366,720,429) --> |
| embeddings | None |
ApiResponseChunk
| field | value |
|---|---|
| text | ## Table | Name | Value | Notes | | ------------------------------------------ | ------------------------------------------- | ------------------------------------------------- | | Text | ABC | Text Value | | Number | 123 | Numerical Value | | Symbol | 🤣🤣🤣 | Emoji symbols | |
| location | <!-- LOC: 1, (96,510,719,581) --> |
| embeddings | None |
Display Markdown
from IPython.display import display, Markdown
# Show each parsed result's Markdown output under the name of its input file.
# NOTE(review): results are paired with presign_responses by position. A job whose
# download was skipped shifts later results onto the wrong file name; confirm the
# two lists line up before relying on the labels.
for presign_response, output_result in zip(presign_responses, output_results):
    print(presign_response.file_name)
    display(Markdown(output_result.markdown.output))
c:\path\to\files\test.docx
Test Document
Header 2
Test paragraph.
Header 3
Another paragraph.
Header 4
Even more paragraph.
Bullets
- Point 1
- Point 2
- Point 3
- Sub-point 3.1
- Sub-point 3.2
Table
| Name | Value | Notes |
|---|---|---|
| Text | ABC | Text Value |
| Number | 123 | Numerical Value |
| Symbol | 🤣🤣🤣 | Emoji symbols |
Display JSON
from IPython.display import display
# Show each parsed result's JSON payload under the name of its input file.
# NOTE(review): results are paired with presign_responses by position. A job whose
# download was skipped shifts later results onto the wrong file name; confirm the
# two lists line up before relying on the labels.
for presign_response, output_result in zip(presign_responses, output_results):
    print(presign_response.file_name)
    display(output_result.json)