How do I extract metadata from a document?¶

Choose a language:

C#
Java
Python
C++17
C

In this Article¶

This sample demonstrates how to use the Hyland Document Filters SDK to extract metadata from a document. It provides a high-level workflow for initializing the Document Filters API, opening a document, and retrieving its metadata.

What you will learn:

API Initialization: Learn how to initialize the Hyland Document Filters API with a valid license code to enable document processing.
Document Extraction: Understand how to obtain an extractor for a document by using the GetExtractor method, and how to open it in metadata-only mode (MetaOnly) so that only the document's metadata is read.
Reading Metadata: Discover how to retrieve specific metadata fields, such as title, author, creation date, and other relevant information, by utilizing appropriate methods provided by the SDK.
Resource Management: Learn about proper resource management in .NET by using `using` statements to ensure that document and API objects are properly disposed of after use.

By working through this sample, you will become familiar with the basics of setting up the Document Filters API, handling document extraction, and iterating over the document's content efficiently.

Installing Document Filters¶

Document Filters has several NuGet packages available. For most scenarios, however, you typically only need Hyland.DocumentFilters.

You can install it using the following command:

dotnet add package Hyland.DocumentFilters

For full setup instructions see Getting Started with .NET

Instructions for accessing the Hyland.DocumentFilters Java package are available here.

Instructions for accessing the Hyland.DocumentFilters Python package are available here. It's as easy as:

pip install git+https://github.com/Hyland/DocumentFilters.git@master#subdirectory=bindings/python

Instructions for accessing the Hyland.DocumentFilters C++ 17 package are available here. It's as easy as:

CMakeLists.txt

# Load the FetchContent module, which provides the two FetchContent_* commands used below.
include(FetchContent)

# Declare where the C++17 bindings come from: the Document Filters Git repository,
# using bindings/cpp17 as the source subdirectory.
# NOTE(review): GIT_TAG tracks the master branch, so builds are not pinned to a
# fixed revision; consider a release tag or commit hash for reproducible builds.
# NOTE(review): SOURCE_SUBDIR appears to require CMake 3.18 or newer - confirm
# against the FetchContent documentation.
FetchContent_Declare(
    HylandDocumentFiltersCpp17
    GIT_REPOSITORY https://github.com/Hyland/DocumentFilters.git
    GIT_TAG master
    SOURCE_SUBDIR bindings/cpp17
)

# Make the declared content available to this build.
FetchContent_MakeAvailable(HylandDocumentFiltersCpp17)

# Build the App executable from App.cpp and link it against the C++17 bindings target.
add_executable(App App.cpp)
target_link_libraries(App PRIVATE DocumentFilters::Cpp17)

Instructions for accessing the Hyland.DocumentFilters C++ 11 package are available here. It's as easy as:

CMakeLists.txt

# Load the FetchContent module, which provides the two FetchContent_* commands used below.
include(FetchContent)

# Declare where the C++11 bindings come from: the Document Filters Git repository,
# using bindings/cpp11 as the source subdirectory.
# NOTE(review): GIT_TAG tracks the master branch, so builds are not pinned to a
# fixed revision; consider a release tag or commit hash for reproducible builds.
# NOTE(review): SOURCE_SUBDIR appears to require CMake 3.18 or newer - confirm
# against the FetchContent documentation.
FetchContent_Declare(
    HylandDocumentFiltersCpp11
    GIT_REPOSITORY https://github.com/Hyland/DocumentFilters.git
    GIT_TAG master
    SOURCE_SUBDIR bindings/cpp11
)

# Make the declared content available to this build.
FetchContent_MakeAvailable(HylandDocumentFiltersCpp11)

# Build the App executable from App.cpp and link it against the C++11 bindings target.
add_executable(App App.cpp)
target_link_libraries(App PRIVATE DocumentFilters::Cpp11)

Instructions for accessing the Hyland.DocumentFilters C package are available here. It's as easy as:

CMakeLists.txt

# Load the FetchContent module, which provides the two FetchContent_* commands used below.
include(FetchContent)

# Declare where the C bindings come from: the Document Filters Git repository,
# using bindings/c as the source subdirectory.
# NOTE(review): GIT_TAG tracks the master branch, so builds are not pinned to a
# fixed revision; consider a release tag or commit hash for reproducible builds.
# NOTE(review): SOURCE_SUBDIR appears to require CMake 3.18 or newer - confirm
# against the FetchContent documentation.
FetchContent_Declare(
    HylandDocumentFiltersC
    GIT_REPOSITORY https://github.com/Hyland/DocumentFilters.git
    GIT_TAG master
    SOURCE_SUBDIR bindings/c
)

# Make the declared content available to this build.
FetchContent_MakeAvailable(HylandDocumentFiltersC)

# Build the App executable from App.c and link it against the C bindings target.
add_executable(App App.c)
target_link_libraries(App PRIVATE DocumentFilters::C)

Extracting Metadata¶

app.cs
using Hyland.DocumentFilters;

// Create the API object and initialize it with the license code and the
// second argument ("."), exactly as passed to Initialize.
var api = new Hyland.DocumentFilters.Api();
api.Initialize("License Code", ".");

// The using declaration disposes the extractor when it goes out of scope.
using var doc = api.GetExtractor("filename.doc");

// MetaOnly: open the document for metadata extraction only.
doc.Open(Hyland.DocumentFilters.OpenType.MetaOnly);

// Read the text in chunks of up to 4096 characters. Write (not WriteLine) is
// used so that no extra line break is inserted at each chunk boundary.
while (!doc.EndOfStream)
{
    var text = doc.GetText(4096);
    Console.Out.Write(text);
}

doc.Close();

See our C# samples on GitHub

App.java
import com.perceptive.documentfilters.*;

/**
 * Sample application that prints the metadata of "filename.doc" to standard output.
 */
public class App
{
    /**
     * Initializes Document Filters, opens the document in metadata-only mode and
     * writes the extracted text to standard output.
     *
     * @param args command-line arguments (not used)
     * @throws Exception propagated from any of the Document Filters calls
     */
    public static void main(String[] args) throws Exception
    {
        DocumentFilters df = new DocumentFilters();
        df.Initialize("License Code", ".");

        // try-with-resources closes the extractor when the block exits.
        try (Extractor doc = df.GetExtractor("filename.doc")) {
            // IGR_META_ONLY: open the document for metadata extraction only.
            doc.Open(isys_docfilters.IGR_META_ONLY);

            // Read the text in chunks of up to 4096 characters. print (not println)
            // is used so that no extra line break is inserted at each chunk boundary.
            while (!doc.getEOF()) {
                String text = doc.GetText(4096);
                System.out.print(text);
            }
        }
    }
}

See our Java samples on GitHub

app.py
import sys

from DocumentFilters import *

# Maximum number of characters requested from each GetText call.
MaxCharsPerGetText = 4096

# Create the API object and initialize it with the license code and the
# second argument ("."), exactly as passed to Initialize.
api = DocumentFilters()
api.Initialize("License Code", ".")

# The with-statement releases the extractor when the block exits.
with api.GetExtractor("filename.doc") as doc:
    # IGR_META_ONLY: open the document for metadata extraction only.
    doc.Open(IGR_META_ONLY, "")

    # Read the text chunk by chunk and write it to standard output without
    # adding line breaks between chunks.
    while not doc.getEOF():
        sys.stdout.write(doc.GetText(MaxCharsPerGetText, stripControlCodes=True))

See our Python samples on GitHub

app.cpp
#include <DocumentFiltersObjects.h>
#include <iostream>

/// Prints the metadata of "filename.doc" to standard output.
///
/// @param argc number of command-line arguments (not used)
/// @param argv command-line arguments (not used)
/// @return 0 on success, 1 if a std::exception was caught
int main(int argc, char **argv)
{
    try {
        // Create a DocumentFilters object (Api is an alias for DocumentFilters)
        Hyland::DocFilters::Api api;

        // Initialize the DocumentFilters object with license and path
        std::string license = "License Code";
        std::string path = ".";
        api.Initialize(license, path);

        // Get an extractor for the specified file
        Hyland::DocFilters::Extractor doc = api.GetExtractor("filename.doc");

        // Open the document for text extraction, requesting metadata only
        doc.Open(Hyland::DocFilters::OpenMode::Text, IGR_META_ONLY);

        // Read the text in chunks of up to 4096 characters. No line break is
        // written between chunks so the output is not split at chunk boundaries.
        while (!doc.getEOF()) {
            std::wstring text = doc.getText(4096);
            std::wcout << text;
        }
        std::wcout << std::flush;

        // Close the document
        doc.Close();

    } catch (const std::exception& ex) {
        std::cerr << "Error: " << ex.what() << std::endl;
        // Report the failure to the caller instead of exiting with success.
        return 1;
    }
    return 0;
}

See our C++ samples on GitHub

app.c
#include <DocumentFilters.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// License code and input file name
// license_code is copied into the instance status block by main before the
// instance is initialized.
// NOTE(review): it is an empty string here - presumably a real license code
// has to be filled in before running; confirm.
const char* license_code = "";
// Path of the document that main opens.
const char* input_file = "filename.doc";

// Function prototypes for UCS2 and UTF8 conversion
// Both helpers are defined after main. Each returns a heap-allocated buffer
// that the caller must free(), or NULL if the allocation fails.
IGR_UCS2* UCS2(const char* src);
char* UTF8(const IGR_UCS2* src, size_t charCount);

int main() {
    // Initialization of status and control blocks
    Instance_Status_Block isb = { 0 };
    Error_Control_Block ecb = { 0 };
    IGR_SHORT instance = 0;
    IGR_LONG caps = 0, type = 0, docHandle = 0, res = 0, pageCount = 0;
    IGR_UCS2* buffer = NULL, *filename = NULL;
    char* utf8 = NULL;

    // Set license code
    strncpy(isb.Licensee_ID1, license_code, sizeof(isb.Licensee_ID1) - 1);
    // Initialize instance
    Init_Instance(0, ".", &isb, &instance, &ecb);

    // Allocate buffer for UCS2 text
    size_t bufferSize = 1024;
    buffer = malloc(bufferSize * sizeof(IGR_UCS2));
    if (!buffer)
        goto error;

    // Convert input file name to UCS2
    filename = UCS2(input_file);
    if (!filename)
        goto cleanup;

    // Open the document file
    res = IGR_Open_File(filename, IGR_META_ONLY, &caps, &type, &docHandle, &ecb);
    if (res != IGR_OK)
        goto error;

    // Read and print text from the document
    IGR_LONG returnedChars = bufferSize;
    while (IGR_Get_Text(docHandle, buffer, &returnedChars, &ecb) == 0) {
        // Convert UCS2 text to UTF8
        utf8 = UTF8(buffer, returnedChars);
        if (!utf8)
            goto error;
        // Print the UTF8 text
        printf("%s", utf8);
        // Free the UTF8 buffer
        free(utf8);
        utf8 = NULL;

        // Reset returnedChars for the next iteration
        returnedChars = bufferSize;
    }

    goto cleanup;

error:
    // Print error message
    if (ecb.Msg[0] != 0)
        fprintf(stderr, "Error: %s\n", ecb.Msg);
    else
        fprintf(stderr, "Error: %d\n", res);
cleanup:
    // Free allocated resources
    if (buffer) free(buffer);
    if (filename) free(filename);
    if (docHandle) IGR_Close_File(docHandle, &ecb);
    return 0;
}

// Convert a UTF8 string to UCS2
//
// src: NUL-terminated UTF8 string; NULL is rejected.
// Returns a heap-allocated UCS2 buffer that the caller must free(), or NULL if
// src is NULL or the allocation fails.
IGR_UCS2* UCS2(const char* src) {
    if (!src)
        return NULL;

    size_t len = strlen(src);
    // Two bytes per input byte plus two bytes for the terminator.
    size_t destSize = len * 2 + 2;

    // calloc zero-fills the buffer, so whatever the conversion leaves untouched
    // (in particular the trailing terminator slot) is already zero.
    IGR_UCS2* res = calloc(1, destSize);
    if (!res)
        return NULL;

    // Perform the conversion
    UTF8_to_Widechar_Ex(src, len, res, destSize);
    return res;
}

// Convert a UCS2 string to UTF8
//
// src:       UCS2 characters to convert (need not be NUL-terminated).
// charCount: number of UCS2 characters in src.
// Returns a heap-allocated, NUL-terminated UTF8 string that the caller must
// free(), or NULL if the allocation fails. An empty input yields an empty string.
char* UTF8(const IGR_UCS2* src, size_t charCount)
{
    // One extra byte beyond the conversion area keeps the result NUL-terminated
    // and avoids a zero-sized allocation when charCount is 0.
    size_t destSize = charCount * 7 + 1;

    // calloc zero-fills the buffer, so the extra byte is the terminator.
    char* res = calloc(destSize, 1);
    if (!res)
        return NULL;

    // Perform the conversion; the extra terminator byte is not offered to it.
    if (src && charCount > 0)
        Widechar_to_UTF8_Ex(src, charCount, res, destSize - 1);
    return res;
}