# Data Curation API - C# Notebook

### Options

In [None]:
// Base URL of the Data Curation API; later cells append "/presign" and "/status/{jobId}" to it.
string dcApiUrl = "https://knowledge-enrichment.ai.experience.hyland.com/latest/api/data-curation";
// Processing options. The object is serialized to JSON and sent as the body of every presign request.
dynamic dcOptions = new {
    normalization = new {
        quotations = true, // Normalize quotation marks
        dashes = true // Normalize dashes
    },
    chunking = true, // Enables chunking of the text
    chunk_size = 1000, // Desired size of each text chunk (may be longer to prevent breaking sentences/paragraphs)
    embedding = false, // Generates text embeddings for the chunks (chunking is required)
    json_schema = false, // "FULL", "MDAST", "PIPELINE", or false
    pii = false // "redaction", "detection", or false
};

### Credentials

In [None]:
// OAuth client credentials need to be configured in the admin console.
// https://admin.experience.hyland.com/external-systems/external-applications
// Ensure the external application has the `environment_authorization` scope and is configured for an environment
// that has the `cin-data-curation-api` application (environment has a Data Curation API subscription).
// Base URL of the identity provider; the token endpoint used by this notebook is "{oauthUrl}/connect/token".
string oauthUrl = "https://auth.iam.experience.hyland.com/idp";
// Client ID and secret of the external application. They are sent as HTTP Basic credentials
// when requesting a token, so both must be filled in before running the cells below.
string oauthClientId = "";
string oauthClientSecret = "";

### Helper Functions

In [None]:
using System.IO;
using System.Text.Json;
using System.Text.Json.Nodes;
using System.Net.Http;
using System.Net.Http.Headers;

// Single client instance reused by SendHttpRequest for every request in this notebook.
HttpClient httpClient = new();

/// <summary>
/// Sends an HTTP request synchronously and returns the response body as a string.
/// </summary>
/// <param name="method">HTTP method to use.</param>
/// <param name="url">URL the request is sent to.</param>
/// <param name="headers">Optional request headers; ignored when null.</param>
/// <param name="content">Optional request body; ignored when null.</param>
/// <returns>The response body text of a successful (2xx) response.</returns>
/// <exception cref="HttpRequestException">
/// Thrown when the request cannot be sent or when the server answers with a non-success status code.
/// In the latter case the exception carries the status code and the message includes the response body.
/// </exception>
string SendHttpRequest(HttpMethod method, string url, Dictionary<string, string> headers = null, HttpContent content = null)
{
    using HttpRequestMessage request = new HttpRequestMessage(method, url);
    if (headers != null)
    {
        foreach (KeyValuePair<string, string> header in headers)
        {
            request.Headers.Add(header.Key, header.Value);
        }
    }
    if (content != null)
    {
        request.Content = content;
    }

    Console.WriteLine($"Sending request to: {request.Method} {request.RequestUri}");
    try
    {
        using HttpResponseMessage response = httpClient.Send(request);

        // GetAwaiter().GetResult() rethrows the original exception; .Result would wrap it in an
        // AggregateException, which the catch below would not match.
        string responseBody = response.Content.ReadAsStringAsync().GetAwaiter().GetResult();

        // HttpClient.Send does not throw for 4xx/5xx responses, so the status has to be checked
        // explicitly; otherwise an error document would be handed to the caller as a normal result.
        if (!response.IsSuccessStatusCode)
        {
            throw new HttpRequestException(
                $"Response status code does not indicate success: {(int)response.StatusCode} ({response.ReasonPhrase}). Response body: {responseBody}",
                null,
                response.StatusCode);
        }

        return responseBody;
    }
    catch (HttpRequestException e)
    {
        Console.WriteLine($"HTTP Error: {e.StatusCode} - {e.Message}");
        throw;
    }
}

/// <summary>
/// Issues a GET request through <c>SendHttpRequest</c> and returns whatever it returns.
/// </summary>
/// <param name="url">URL to request.</param>
/// <param name="headers">Optional request headers; may be null.</param>
/// <returns>The response body text.</returns>
string SendGetRequest(string url, Dictionary<string, string> headers = null)
    => SendHttpRequest(HttpMethod.Get, url, headers);

/// <summary>
/// Issues a POST request carrying the given content through <c>SendHttpRequest</c>.
/// </summary>
/// <param name="url">URL to post to.</param>
/// <param name="headers">Request headers; may be null.</param>
/// <param name="content">Request body; may be null.</param>
/// <returns>The response body text.</returns>
string SendPostRequest(string url, Dictionary<string, string> headers, HttpContent content)
    => SendHttpRequest(HttpMethod.Post, url, headers, content);

/// <summary>
/// Serializes <paramref name="data"/> to JSON and posts it with content type "application/json".
/// </summary>
/// <param name="url">URL to post to.</param>
/// <param name="headers">Request headers; may be null.</param>
/// <param name="data">Object serialized with <c>JsonDumpString</c> to form the request body.</param>
/// <returns>The response body text.</returns>
string SendJsonPostRequest(string url, Dictionary<string, string> headers, object data)
{
    string payload = JsonDumpString(data);
    using (StringContent body = new StringContent(payload, Encoding.UTF8, "application/json"))
    {
        return SendPostRequest(url, headers, body);
    }
}

/// <summary>
/// Posts the given key/value pairs as URL-encoded form content.
/// </summary>
/// <param name="url">URL to post to.</param>
/// <param name="headers">Request headers; may be null.</param>
/// <param name="data">Form fields to encode into the request body.</param>
/// <returns>The response body text.</returns>
string SendFormPostRequest(string url, Dictionary<string, string> headers, Dictionary<string, string> data)
{
    using (FormUrlEncodedContent body = new FormUrlEncodedContent(data))
    {
        return SendPostRequest(url, headers, body);
    }
}

/// <summary>
/// Sends a PUT request whose body is the given bytes, typed as "application/octet-stream".
/// </summary>
/// <param name="url">URL to put to.</param>
/// <param name="headers">Request headers; may be null.</param>
/// <param name="data">Raw bytes used as the request body.</param>
/// <returns>The response body text.</returns>
string SendPutRequest(string url, Dictionary<string, string> headers, byte[] data)
{
    using (ByteArrayContent payload = new ByteArrayContent(data))
    {
        payload.Headers.ContentType = new MediaTypeHeaderValue("application/octet-stream");
        return SendHttpRequest(HttpMethod.Put, url, headers, payload);
    }
}

/// <summary>
/// Requests a new access token from "{oauthUrl}/connect/token" using the client-credentials grant.
/// </summary>
/// <returns>
/// The value of the "access_token" member of the token response, or null when that member is absent.
/// </returns>
string RefreshAccessToken()
{
    Console.WriteLine("Getting a new access token");

    // Client ID and secret travel as HTTP Basic credentials.
    string basicCredentials = Base64Encode($"{oauthClientId}:{oauthClientSecret}");
    Dictionary<string, string> requestHeaders = new()
    {
        ["Accept"] = "application/json",
        ["Authorization"] = $"Basic {basicCredentials}",
    };
    Dictionary<string, string> formFields = new()
    {
        ["grant_type"] = "client_credentials",
        ["scope"] = "environment_authorization",
    };

    string responseBody = SendFormPostRequest($"{oauthUrl}/connect/token", requestHeaders, formFields);
    JsonNode responseJson = JsonLoadString(responseBody);
    string newToken = responseJson["access_token"]?.ToString();

    // Called for its console output (remaining lifetime or a validation error); the result is not used.
    IsAccessTokenValid(newToken);

    return newToken;
}

/// <summary>
/// Takes the second dot-separated segment of the token, base64-decodes it and parses it as JSON.
/// </summary>
/// <param name="token">Token consisting of dot-separated segments.</param>
/// <returns>The parsed JSON of the second segment.</returns>
JsonNode DecodeAccessToken(string token)
{
    string[] segments = token.Split(".");
    string payloadSegment = segments[1];
    string payloadJson = Base64Decode(payloadSegment);
    return JsonLoadString(payloadJson);
}

/// <summary>
/// Reports whether the token's "exp" claim lies more than 30 seconds in the future.
/// </summary>
/// <param name="token">Access token to inspect; may be null or blank.</param>
/// <returns>
/// True when the token decodes and expires more than 30 seconds from now; false when it is
/// null/blank, cannot be decoded, or expires sooner. A missing "exp" claim is treated as 0.
/// </returns>
bool IsAccessTokenValid(string token)
{
    if (string.IsNullOrWhiteSpace(token))
    {
        Console.WriteLine("Access token is empty");
        Console.WriteLine();
        return false;
    }

    try
    {
        JsonNode claims = DecodeAccessToken(token);
        JsonNode expNode = claims["exp"];
        double expSeconds = expNode is null ? 0 : expNode.GetValue<double>();

        // "exp" is interpreted as seconds since 1970-01-01T00:00:00Z.
        TimeSpan remaining = DateTime.UnixEpoch.AddSeconds(expSeconds) - DateTime.UtcNow;
        Console.WriteLine($"Access Token Expires In: {remaining}");
        Console.WriteLine();

        // Require a 30 second safety margin.
        return remaining > TimeSpan.FromSeconds(30);
    }
    catch (Exception e)
    {
        // Any decoding or parsing failure counts as "not valid".
        Console.WriteLine($"Error validating access token: {e.Message}");
        Console.WriteLine();
        return false;
    }
}

/// <summary>
/// Returns the base64 representation of the UTF-8 bytes of <paramref name="data"/>.
/// </summary>
/// <param name="data">Text to encode.</param>
/// <returns>Base64 text (standard alphabet, padded).</returns>
string Base64Encode(string data)
{
    byte[] utf8Bytes = Encoding.UTF8.GetBytes(data);
    return Convert.ToBase64String(utf8Bytes);
}

/// <summary>
/// Decodes base64 or base64url text into a UTF-8 string.
/// </summary>
/// <remarks>
/// Accepts both the standard alphabet ('+', '/') and the URL-safe alphabet ('-', '_'),
/// with or without trailing '=' padding, so unpadded token segments decode correctly.
/// </remarks>
/// <param name="data">Base64 or base64url text.</param>
/// <returns>The decoded bytes interpreted as UTF-8.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="data"/> is null.</exception>
/// <exception cref="FormatException">Thrown when the input is not valid base64 even after normalization.</exception>
string Base64Decode(string data)
{
    if (data == null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    // Map the URL-safe alphabet onto the standard one understood by Convert.FromBase64String.
    string normalized = data.Replace('-', '+').Replace('_', '/');

    // Convert.FromBase64String skips white space, so only the remaining characters decide
    // how much padding is missing.
    int significantLength = 0;
    foreach (char c in normalized)
    {
        if (!char.IsWhiteSpace(c))
        {
            significantLength++;
        }
    }

    // Restore omitted padding: 0, 1 or 2 '=' for valid input. A remainder of 1 can never be
    // valid base64 and is left for FromBase64String to reject with a FormatException.
    int missingPadding = (4 - significantLength % 4) % 4;
    if (missingPadding > 0)
    {
        normalized += new string('=', missingPadding);
    }

    byte[] decodedBytes = Convert.FromBase64String(normalized);
    return Encoding.UTF8.GetString(decodedBytes);
}

/// <summary>
/// Serializes an object to indented JSON text.
/// </summary>
/// <param name="obj">Object to serialize.</param>
/// <returns>The indented JSON representation of <paramref name="obj"/>.</returns>
string JsonDumpString(object obj)
    => JsonSerializer.Serialize(obj, new JsonSerializerOptions { WriteIndented = true });

/// <summary>
/// Parses JSON text into a <see cref="JsonNode"/> tree.
/// </summary>
/// <param name="jsonData">JSON text to parse.</param>
/// <returns>The parsed node.</returns>
JsonNode JsonLoadString(string jsonData)
{
    JsonNode parsed = JsonSerializer.Deserialize<JsonNode>(jsonData);
    return parsed;
}

/// <summary>
/// Returns the full paths of the files directly inside a directory and displays their names as a markdown list.
/// </summary>
/// <param name="directory">Directory path; converted to a full path before it is enumerated.</param>
/// <returns>Full paths of the files found.</returns>
string[] ListFiles(string directory)
{
    string fullPath = Path.GetFullPath(directory);
    string[] files = Directory.EnumerateFiles(fullPath).ToArray();

    // One "- name" bullet per file, names only (no directory part).
    string bulletList = string.Join("\r\n- ", files.Select(path => Path.GetFileName(path)));
    string markdown = $"Listing files in path: {fullPath}\r\n- {bulletList}";
    markdown.DisplayAs("text/markdown");

    return files;
}

### List Files to Upload

In [None]:
// Expects there to be an "input" directory with files to process.
// The path is relative; ListFiles converts it to a full path before enumerating it.
// The resulting full file paths are used by the presign and upload cells.
string[] inputFileNames = ListFiles("input");

### Get Access Token

In [None]:
using System.Text.Json.Nodes;

// Request a token and, when one was returned, print its decoded payload for inspection.
string accessToken = RefreshAccessToken();
if (accessToken is not null)
{
    Console.WriteLine($"Access Token: {JsonDumpString(DecodeAccessToken(accessToken))}");
}

### Get Presign URL

In [None]:
using System.Text.Json.Nodes;

// Renew the bearer token first when it is missing or about to expire.
if (!IsAccessTokenValid(accessToken))
{
    accessToken = RefreshAccessToken();
}

/// <summary>
/// Values read from one presign response, paired with the local file the job was requested for.
/// </summary>
public record PresignResponse(string JobId, string PutUrl, string GetUrl, string FileName);
List<PresignResponse> presignResponses = new();

// The endpoint and headers are the same for every file, so they are built once.
string presignUrl = $"{dcApiUrl}/presign";
Dictionary<string, string> presignHeaders = new()
{
    { "Accept", "application/json" },
    { "Authorization", $"Bearer {accessToken}" },
};

// One presign request per input file; each request carries the processing options as its JSON body.
foreach (string inputFileName in inputFileNames)
{
    JsonNode presignResponseJson = JsonLoadString(SendJsonPostRequest(presignUrl, presignHeaders, dcOptions));
    Console.WriteLine($"Presign Response for {inputFileName}:");
    Console.WriteLine(presignResponseJson);
    Console.WriteLine();

    // A response body of "null" parses to a null node; "?." keeps that from throwing here.
    PresignResponse presignResponse = new(
        JobId: presignResponseJson?["job_id"]?.ToString(),
        PutUrl: presignResponseJson?["put_url"]?.ToString(),
        GetUrl: presignResponseJson?["get_url"]?.ToString(),
        FileName: inputFileName
    );

    // Say so when a file is left out, instead of dropping it without a trace.
    if (presignResponse.JobId is null || presignResponse.PutUrl is null || presignResponse.GetUrl is null)
    {
        Console.WriteLine($"Skipping {inputFileName}: presign response is missing job_id, put_url or get_url");
        Console.WriteLine();
        continue;
    }

    presignResponses.Add(presignResponse);
}

### Upload Files

In [None]:
using System.IO;

// Upload every input file to the PUT URL obtained for it in the presign step.
foreach (PresignResponse upload in presignResponses)
{
    Console.WriteLine($"Uploading file: {upload.FileName}");

    byte[] fileBytes = File.ReadAllBytes(upload.FileName);
    SendPutRequest(upload.PutUrl, headers: null, data: fileBytes);

    // NOTE(review): this line runs whenever SendPutRequest returns; the response body is
    // discarded and not inspected in this cell.
    Console.WriteLine($"Upload succeeded for {upload.JobId}");
    Console.WriteLine();
}

### Get Status

In [None]:
using System.Text.Json.Nodes;

// Renew the bearer token first when it is missing or about to expire.
if (!IsAccessTokenValid(accessToken))
{
    accessToken = RefreshAccessToken();
}

/// <summary>
/// Job identifier and status text read from a status response.
/// </summary>
public record StatusResponse(string JobId, string Status);

// Query and print the current status of every job created in the presign step.
foreach (PresignResponse job in presignResponses)
{
    Dictionary<string, string> statusHeaders = new()
    {
        ["Accept"] = "application/json",
        ["Authorization"] = $"Bearer {accessToken}",
    };
    string statusBody = SendGetRequest($"{dcApiUrl}/status/{job.JobId}", statusHeaders);
    JsonNode statusJson = JsonLoadString(statusBody);

    // NOTE(review): this reads "jobId" while the presign cell reads "job_id" — confirm the key against the API.
    StatusResponse status = new(
        JobId: statusJson["jobId"]?.ToString(),
        Status: statusJson["status"]?.ToString()
    );

    Console.WriteLine($"Status for {job.JobId} = {status.Status}");
    Console.WriteLine();
}

### Download Results

In [None]:
using System.Text.Json.Nodes;

// Fetch the result document of every job from its GET URL and keep the ones that parsed to a non-null node.
List<JsonNode> outputResultsJson = new();
foreach (PresignResponse job in presignResponses)
{
    Console.WriteLine($"Downloading results for: {job.JobId}");
    string resultBody = SendGetRequest(job.GetUrl);
    JsonNode resultJson = JsonLoadString(resultBody);

    Console.WriteLine($"Results for {job.JobId}:");
    Console.WriteLine($"{resultJson}");
    Console.WriteLine();

    // A body of "null" parses to a null node; such results are not collected.
    if (resultJson is null)
    {
        continue;
    }
    outputResultsJson.Add(resultJson);
}


### Parse Results

In [None]:
using System.Text.Json.Nodes;

/// <summary>One text chunk with its location and, when present, its embedding vector.</summary>
public record ApiResponseChunk(string Text, string Location, double[] Embeddings);
/// <summary>Markdown output of a result together with its chunks.</summary>
public record ApiResponseMarkdown(string Output, ApiResponseChunk[] Chunks);
/// <summary>Typed view over one downloaded result document.</summary>
public record ApiResponse(ApiResponseMarkdown Markdown, JsonNode Json);

// Convert every downloaded JSON document into the typed records above.
List<ApiResponse> outputResults = new();
foreach (JsonNode outputResultJson in outputResultsJson)
{
    // Each member is optional: a missing "markdown" object or missing arrays yield nulls
    // instead of throwing a NullReferenceException.
    JsonNode markdownJson = outputResultJson["markdown"];
    JsonArray locationsJson = markdownJson?["locations"]?.AsArray();
    JsonArray chunksJson = markdownJson?["chunks"]?.AsArray();
    JsonArray chunkWithEmbeddingsJson = markdownJson?["chunks_with_embeddings"]?.AsArray();

    List<ApiResponseChunk> chunks = new();
    if (chunkWithEmbeddingsJson != null)
    {
        // Chunks with embeddings take precedence over plain chunks.
        for (int i = 0; i < chunkWithEmbeddingsJson.Count; i++)
        {
            JsonNode chunkJson = chunkWithEmbeddingsJson[i];
            JsonArray embeddingsJson = chunkJson?["embeddings"]?.AsArray();
            double[] embeddings = embeddingsJson?.Select(e => e.GetValue<double>()).ToArray();
            chunks.Add(new ApiResponseChunk(
                Text: chunkJson?["chunk"]?.ToString(),
                // The location is taken by index; null when there is no location for this chunk.
                Location: locationsJson != null && i < locationsJson.Count ? locationsJson[i]?.ToString() : null,
                Embeddings: embeddings
            ));
        }
    }
    else if (chunksJson != null)
    {
        for (int i = 0; i < chunksJson.Count; i++)
        {
            chunks.Add(new ApiResponseChunk(
                Text: chunksJson[i]?.ToString(),
                // The location is taken by index; null when there is no location for this chunk.
                Location: locationsJson != null && i < locationsJson.Count ? locationsJson[i]?.ToString() : null,
                Embeddings: null
            ));
        }
    }

    ApiResponse apiResponse = new(
        Markdown: new(
            Output: markdownJson?["output"]?.ToString(),
            Chunks: chunks.ToArray()
        ),
        Json: outputResultJson["json"]
    );
    outputResults.Add(apiResponse);
}
display(outputResults);


### Display Markdown

In [None]:
// Print each file name followed by its markdown output rendered as markdown.
// NOTE(review): pairs outputResults[n] with presignResponses[n]; the download cell skips
// null results, so confirm the two lists stay aligned.
for (int index = 0; index < outputResults.Count; index++)
{
    string fileName = presignResponses[index].FileName;
    Console.WriteLine(fileName);

    string markdown = outputResults[index].Markdown.Output;
    markdown.DisplayAs("text/markdown");
}


### Display JSON

In [None]:
// Print each file name followed by the JSON part of its result, indented.
// NOTE(review): pairs outputResults[i] with presignResponses[i]; the download cell skips
// null results, so confirm the two lists stay aligned.
for (int i = 0; i < outputResults.Count; i++)
{
    Console.WriteLine(presignResponses[i].FileName);

    // Json is null when the result document has no "json" member; calling ToString() on it
    // would throw a NullReferenceException, so report the absence instead.
    var json = outputResults[i].Json;
    if (json is null)
    {
        Console.WriteLine("No JSON output in this result");
        continue;
    }

    // The node is serialized directly; converting it to text and parsing it again first is unnecessary.
    display(JsonDumpString(json));
}