# OpenAPI document header: specification version, API identity, and the
# contact entry pointing at the published documentation.
openapi: 3.0.1
info:
  version: v1
  title: Content Intelligence Data Curation API
  contact:
    url: https://hyland.github.io/ContentIntelligence-Docs/KnowledgeEnrichment/DataCurationAPI
    name: Data Curation API Documentation
# Base URL that every path in this document is resolved against.
# The URL contains no `{placeholder}` template segments, so no server
# `variables` are declared; the human-readable label is carried by the
# Server Object's own `description` field instead.
servers:
  - url: https://knowledge-enrichment.ai.experience.hyland.com/latest/api/data-curation
    description: Data Curation API URL
paths:
  # POST /presign: issues the upload/result URL pair that starts an ad hoc
  # pipeline run. The request body is optional.
  /presign:
    post:
      operationId: Presign
      summary: Create Presigned URLs for S3 Upload
      description: >-
        As the entry point for an ad hoc data curation pipeline, create a pair
        of presigned URLs to upload a file to an AWS S3 bucket and retrieve the
        results of the completed pipeline.
      # Requires the `token` scheme declared under components.securitySchemes.
      security:
        - token: []
      requestBody:
        required: false
        description: >-
          Optional objects can be provided at the start of a job to modify
          pipeline behavior. If these objects are not provided, default values
          are used.
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/PresignUrl.Options'
            examples:
              # Empty object: every option is left unset.
              defaults:
                summary: All defaults (minimal request)
                value: {}
              # Every top-level option present, PII in redaction mode.
              full:
                summary: All options explicitly set
                value:
                  normalization:
                    quotations: true
                    dashes: true
                  chunking: true
                  chunking_strategy: context
                  chunk_size: 2000
                  embedding: true
                  embeddings_model: cohere.embed-multilingual-v3
                  json_schema: PIPELINE
                  pii:
                    mode: redaction
                    entity_redaction: false
              # PII in detection mode; JSON output switched off.
              pii_detection_only:
                summary: PII detection without redaction
                value:
                  normalization:
                    quotations: true
                    dashes: true
                  chunking: true
                  chunk_size: 1500
                  embedding: true
                  json_schema: false
                  pii:
                    mode: detection
                    entity_redaction: false
              # Fixed-size chunking with embedding and PII both switched off.
              no_embeddings:
                summary: Chunking only, no embeddings
                value:
                  normalization:
                    quotations: true
                    dashes: true
                  chunking: true
                  chunking_strategy: fixed
                  chunk_size: 1000
                  embedding: false
                  json_schema: MDAST
                  pii: false
      responses:
        '200':
          description: Success
          content:
            text/json:
              schema:
                $ref: '#/components/schemas/PresignUrl.JsonResponse'
        '403':
          description: Forbidden
          content:
            text/json:
              schema:
                $ref: '#/components/schemas/API.Forbidden'
  # GET /status/{jobId}: looks up the status of a data curation job by ID.
  /status/{jobId}:
    get:
      summary: Return the Status of the Specified Job
      operationId: Status
      description: Returns the status of the data curation job with the specified ID.
      # This operation documents a 403 (Forbidden) response, so it is declared
      # as token-protected, the same way /presign is. Without this entry,
      # generated clients and interactive docs would not send the bearer token.
      security:
        - token: []
      parameters:
        - name: jobId
          in: path
          description: Alphanumeric ID for the data curation job.
          required: true
          schema:
            $ref: "#/components/schemas/JobId"
      responses:
        "200":
          description: Successful status retrieval; see response for status.
          content:
            text/json:
              schema:
                $ref: "#/components/schemas/Status.Response"
        "403":
          description: Forbidden
          content:
            text/json:
              schema:
                $ref: "#/components/schemas/API.Forbidden"
        "404":
          description: Not Found
          content:
            text/json:
              schema:
                $ref: "#/components/schemas/Status.NotFound"
components:
  schemas:
    # Error body referenced by the 403 responses in this document.
    API.Forbidden:
      type: object
      properties:
        Message:
          type: string
          nullable: false
          # Sample value only. `default` describes what a receiver assumes when
          # a value is absent, which does not apply to a field the server
          # sends, so the sample is published as `example`.
          example: User is not authorized to access this resource with an explicit deny.
    # Job identifier type shared by the presign response, the status path
    # parameter, and the status response.
    JobId:
      nullable: false
      type: string
      format: uuid
    # Success body of POST /presign: the job ID plus the two presigned URLs.
    PresignUrl.JsonResponse:
      type: object
      properties:
        job_id:
          $ref: "#/components/schemas/JobId"
        put_url:
          type: string
          format: uri
          nullable: false
          # Placeholder URL shown as a sample. It is declared as `example`
          # rather than `default` so tooling does not treat it as a fallback
          # value for the field.
          example: https://data-curation-api-prod-drop.s3.amazonaws.com/ABCXYZ
        get_url:
          type: string
          format: uri
          nullable: false
          # Placeholder URL shown as a sample (see put_url).
          example: https://data-curation-api-prod-results.s3.amazonaws.com/ABCXYZ
    # Request-body schema referenced by POST /presign. No property is listed
    # as required, and the object itself may be null.
    PresignUrl.Options:
      type: object
      nullable: true
      description: >
        Pipeline configuration options supplied at presign time. All fields are optional;
        omitted fields fall back to environment-level defaults. A null body is treated
        as equivalent to an empty object (all defaults apply).
      properties:
        # Text clean-up switches applied before the later stages.
        normalization:
          type: object
          nullable: true
          description: >
            Controls Unicode normalization applied to the extracted text before further
            processing. Normalization replaces visually similar characters with their
            canonical ASCII equivalents, which improves downstream chunking, search,
            and embedding quality.
          properties:
            quotations:
              type: boolean
              nullable: true
              description: >
                Replaces "smart" (curly) quotation marks and apostrophes with their straight, ASCII equivalents (`"` and `'`).
                Defaults to the environment-configured value when omitted.
            dashes:
              type: boolean
              nullable: true
              description: >
                Replaces en-dashes (`–`) and em-dashes (`—`) with a standard ASCII hyphen-minus (`-`). Defaults to the environment-configured value when omitted.
        chunking:
          type: boolean
          nullable: true
          description: >
            Enables or disables the chunking stage of the pipeline. When `false`, the extracted text is returned as a single block and the embedding stage is skipped regardless of the `embedding` flag.
            Defaults to the environment-configured value when omitted.
        # NOTE(review): `nullable: true` is combined with an `enum` that does
        # not list null; under the OpenAPI 3.0.3 wording a strict validator may
        # still reject null here. Confirm whether null is meant to be accepted.
        chunking_strategy:
          type: string
          nullable: true
          enum:
            - context
            - fixed
          description: >
            Algorithm used when `chunking` is true.

            - **context** (default): Text-aware chunking that respects sentence and
              paragraph boundaries, producing semantically coherent chunks.
            - **fixed**: Fixed-size chunking that splits text into uniform-sized chunks.
              Use when consistent chunk sizes are required.
        # No `minimum`/`maximum` is declared: per the description, out-of-range
        # values fall back to a default rather than being rejected.
        chunk_size:
          type: integer
          nullable: true
          description: >
            Target character count for each chunk when `chunking` is true. Must be a
            positive integer no greater than the selected embedding model's maximum chunk
            size. Values outside this range or non-integer values fall back to the
            model's configured default chunk size.
        embedding:
          type: boolean
          nullable: true
          description: >
            Enables or disables the embedding stage of the pipeline. Requires `chunking`
            to also be `true`; if chunking is disabled, no embeddings are generated.
            Defaults to the environment-configured value when omitted.
        embeddings_model:
          type: string
          nullable: true
          description: >
            Identifier of the embedding model to use. Must be one of the models available in the environment's configured allow-list. When omitted, the environment's default model is used (for example, `cohere.embed-multilingual-v3`).
        # Accepts either the literal `false` or one of the schema-variant names.
        # NOTE(review): OpenAPI 3.0.3 applies `nullable` only next to an
        # explicit `type` in the same schema object; with `oneOf` and no `type`,
        # strict validators may not accept null. Confirm the intended behaviour.
        json_schema:
          nullable: true
          description: >
            Controls whether a structured JSON representation of the document is
            included in the pipeline output, and which schema variant to use.
            Set to `false` (or omit the field) to exclude JSON output entirely.
          oneOf:
            - type: boolean
              enum:
                - false
            - type: string
              enum:
                - MDAST
                - FULL
                - PIPELINE
          # Vendor extension: one description per string enum value above.
          x-enum-descriptions:
            MDAST: >
              Markdown Abstract Syntax Tree — a structured representation of the
              document's Markdown content following the MDAST specification.
            FULL: >
              Full document JSON including all extracted metadata, structural
              elements, and content.
            PIPELINE: >
              Internal pipeline representation used for debugging and integration
              testing. Includes intermediate processing artefacts.
        # Accepts either the literal `false` or a PiiOptions object.
        # NOTE(review): same `nullable`-without-`type` caveat as json_schema.
        pii:
          nullable: true
          description: >
            Controls Personally Identifiable Information (PII) processing. Set to
            `false` (or omit the field) to skip PII processing entirely.
          oneOf:
            - type: boolean
              enum:
                - false
            - $ref: "#/components/schemas/PiiOptions"
    # Object form of the `pii` option. `mode` is the only required key.
    PiiOptions:
      description: Configuration for PII detection and optional redaction.
      type: object
      required: [mode]
      properties:
        # Selects between annotating PII and replacing it.
        mode:
          description: >
            - **detection**: Identifies and annotates PII entities in the output without
              modifying the source text.
            - **redaction**: Replaces detected PII with placeholder tokens, removing
              sensitive data from the pipeline output.
          type: string
          enum: [detection, redaction]
        # Optional flag; declared default is false.
        entity_redaction:
          description: >
            Controls whether named entities (such as people, organisations, or locations) are also redacted. Requires `mode` to also be `redaction`. Defaults to false when omitted.
          type: boolean
          default: false
          nullable: true
    # Success body of GET /status/{jobId}.
    Status.Response:
      type: object
      properties:
        jobId:
          $ref: "#/components/schemas/JobId"
        status:
          type: string
          nullable: false
          # Sample value only. Declaring it as `default` could lead generated
          # clients to report a missing status as "Done", so it is published
          # as `example` instead.
          example: Done
    # Error body referenced by the 404 response of GET /status/{jobId}.
    Status.NotFound:
      type: object
      properties:
        message:
          type: string
          nullable: false
          # Sample value only. `default` does not apply to a field the server
          # sends, so the sample is published as `example`.
          example: Job not found
  # Authentication: a single HTTP bearer scheme, referenced by operations
  # under the name `token`.
  securitySchemes:
    token:
      type: http
      scheme: bearer
      bearerFormat: JWT
      description: JWT Bearer Token. Requires the environment_authorization scope.
