In this Article
This sample demonstrates how to use the Hyland Document Filters SDK to extract metadata from a document. It provides a high-level workflow for initializing the Document Filters API, opening a document, and retrieving its metadata.
What you will learn:
- API Initialization: Learn how to initialize the Hyland Document Filters API with a valid license code to enable document processing.
- Document Extraction: Understand how to obtain an extractor for a document with the GetExtractor method and then open it in metadata-only mode (MetaOnly / IGR_META_ONLY), which is the mode every sample below uses.
- Reading Metadata: Discover how to retrieve specific metadata fields, such as title, author, creation date, and other relevant information, by utilizing appropriate methods provided by the SDK.
- Resource Management: Learn about proper resource management in .NET by employing `using` statements to ensure that document and API objects are properly disposed of after use.
By working through this sample, you will become familiar with the basics of setting up the Document Filters API, handling document extraction, and iterating over the document's content efficiently.
Installing Document Filters
Document Filters has several NuGet packages available. For most scenarios, however, you typically only need Hyland.DocumentFilters.
You can install it using the following command:
dotnet add package Hyland.DocumentFilters
For full setup instructions, see Getting Started with .NET.
Instructions for accessing the Hyland.DocumentFilters Java package are available here.
Instructions for accessing the Hyland.DocumentFilters Python package are available here. It's as easy as:
pip install git+https://github.com/Hyland/DocumentFilters.git@master#subdirectory=bindings/python
Instructions for accessing the Hyland.DocumentFilters C++17 package are available here. It's as easy as:
CMakeLists.txt
include(FetchContent)
FetchContent_Declare(
HylandDocumentFiltersCpp17
GIT_REPOSITORY https://github.com/Hyland/DocumentFilters.git
GIT_TAG master
SOURCE_SUBDIR bindings/cpp17
)
FetchContent_MakeAvailable(HylandDocumentFiltersCpp17)
add_executable(App App.cpp)
target_link_libraries(App PRIVATE DocumentFilters::Cpp17)
Instructions for accessing the Hyland.DocumentFilters C++11 package are available here. It's as easy as:
CMakeLists.txt
include(FetchContent)
FetchContent_Declare(
HylandDocumentFiltersCpp11
GIT_REPOSITORY https://github.com/Hyland/DocumentFilters.git
GIT_TAG master
SOURCE_SUBDIR bindings/cpp11
)
FetchContent_MakeAvailable(HylandDocumentFiltersCpp11)
add_executable(App App.cpp)
target_link_libraries(App PRIVATE DocumentFilters::Cpp11)
Instructions for accessing the Hyland.DocumentFilters C package are available here. It's as easy as:
CMakeLists.txt
include(FetchContent)
FetchContent_Declare(
HylandDocumentFiltersC
GIT_REPOSITORY https://github.com/Hyland/DocumentFilters.git
GIT_TAG master
SOURCE_SUBDIR bindings/c
)
FetchContent_MakeAvailable(HylandDocumentFiltersC)
add_executable(App App.c)
target_link_libraries(App PRIVATE DocumentFilters::C)
app.cs |
---|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 | using Hyland.DocumentFilters;
// Create the Document Filters API object and initialize it with the
// license code and the "." path argument.
var filters = new Hyland.DocumentFilters.Api();
filters.Initialize("License Code", ".");

// Obtain an extractor for the input file; the using declaration disposes
// it when it goes out of scope.
using var extractor = filters.GetExtractor("filename.doc");

// Open the document in metadata-only mode.
extractor.Open(Hyland.DocumentFilters.OpenType.MetaOnly);

// Pull text in chunks of up to 4096 characters until the stream is exhausted,
// writing each chunk on its own line.
while (!extractor.EndOfStream)
{
    var chunk = extractor.GetText(4096);
    Console.WriteLine(chunk);
}

extractor.Close();
|
See our C# samples on GitHub
App.java |
---|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 | import com.perceptive.documentfilters.*;
/**
 * Minimal sample that opens a document in metadata-only mode and prints the
 * text returned by the extractor to standard output.
 */
public class App
{
    /**
     * Initializes Document Filters, opens "filename.doc" with IGR_META_ONLY and
     * prints every chunk returned by GetText.
     *
     * @param args command-line arguments (unused)
     * @throws Exception propagated from the Document Filters calls
     */
    public static void main(String[] args) throws Exception
    {
        DocumentFilters df = new DocumentFilters();
        df.Initialize("License Code", ".");

        // try-with-resources closes the extractor when the block exits.
        try (Extractor doc = df.GetExtractor("filename.doc")) {
            doc.Open(isys_docfilters.IGR_META_ONLY);

            // Read up to 4096 characters at a time until the extractor reports EOF.
            while (!doc.getEOF()) {
                // Java's string type is java.lang.String; lowercase "string" does not compile.
                String text = doc.GetText(4096);
                System.out.println(text);
            }
        }
    }
}
|
See our Java samples on GitHub
app.py |
---|
| from DocumentFilters import *
import sys

# Upper bound on the number of characters requested per GetText call.
# The snippet previously referenced an undefined MaxCharsPerGetText name.
MAX_CHARS_PER_GET_TEXT = 4096

# Create the API object and initialize it with the license code and "." path.
api = DocumentFilters()
api.Initialize("License Code", ".")

# Cleanup of the extractor is left to its context manager.
with api.GetExtractor("filename.doc") as doc:
    # Open the document in metadata-only mode.
    doc.Open(IGR_META_ONLY, "")

    # Write each chunk to standard output until the extractor reports EOF.
    # (The snippet previously wrote to an undefined "output" object.)
    while not doc.getEOF():
        sys.stdout.write(doc.GetText(MAX_CHARS_PER_GET_TEXT, stripControlCodes=True))
|
See our Python samples on GitHub
app.cpp |
---|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34 | #include <DocumentFiltersObjects.h>
#include <iostream>
/// Opens "filename.doc" in metadata-only mode and prints the extracted text.
///
/// @param argc number of command-line arguments (unused)
/// @param argv command-line arguments (unused)
/// @return 0 on success, 1 if any step throws a std::exception
int main(int argc, char **argv)
{
    try {
        // Create a DocumentFilters object (Api is an alias for DocumentFilters)
        Hyland::DocFilters::Api api;

        // Initialize the DocumentFilters object with license and path
        std::string license = "License Code";
        std::string path = ".";
        api.Initialize(license, path);

        // Get an extractor for the specified file
        Hyland::DocFilters::Extractor doc = api.GetExtractor("filename.doc");

        // Open the document for text extraction with the IGR_META_ONLY flag
        doc.Open(Hyland::DocFilters::OpenMode::Text, IGR_META_ONLY);

        // Read and print the text content, up to 4096 characters per call
        while (!doc.getEOF()) {
            std::wstring text = doc.getText(4096);
            std::wcout << text << std::endl;
        }

        // Close the document
        doc.Close();
    } catch (const std::exception& ex) {
        std::cerr << "Error: " << ex.what() << std::endl;
        // Report the failure to the caller instead of exiting with success.
        return 1;
    }
    return 0;
}
|
See our C++ samples on GitHub
app.c |
---|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101 | #include <DocumentFilters.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// License code and input file name
const char* license_code = "";
const char* input_file = "filename.doc";
// Function prototypes for UCS2 and UTF8 conversion
IGR_UCS2* UCS2(const char* src);
char* UTF8(const IGR_UCS2* src, size_t charCount);
int main() {
// Initialization of status and control blocks
Instance_Status_Block isb = { 0 };
Error_Control_Block ecb = { 0 };
IGR_SHORT instance = 0;
IGR_LONG caps = 0, type = 0, docHandle = 0, res = 0, pageCount = 0;
IGR_UCS2* buffer = NULL, *filename = NULL;
char* utf8 = NULL;
// Set license code
strncpy(isb.Licensee_ID1, license_code, sizeof(isb.Licensee_ID1) - 1);
// Initialize instance
Init_Instance(0, ".", &isb, &instance, &ecb);
// Allocate buffer for UCS2 text
size_t bufferSize = 1024;
buffer = malloc(bufferSize * sizeof(IGR_UCS2));
if (!buffer)
goto error;
// Convert input file name to UCS2
filename = UCS2(input_file);
if (!filename)
goto cleanup;
// Open the document file
res = IGR_Open_File(filename, IGR_META_ONLY, &caps, &type, &docHandle, &ecb);
if (res != IGR_OK)
goto error;
// Read and print text from the document
IGR_LONG returnedChars = bufferSize;
while (IGR_Get_Text(docHandle, buffer, &returnedChars, &ecb) == 0) {
// Convert UCS2 text to UTF8
utf8 = UTF8(buffer, returnedChars);
if (!utf8)
goto error;
// Print the UTF8 text
printf("%s", utf8);
// Free the UTF8 buffer
free(utf8);
utf8 = NULL;
// Reset returnedChars for the next iteration
returnedChars = bufferSize;
}
goto cleanup;
error:
// Print error message
if (ecb.Msg[0] != 0)
fprintf(stderr, "Error: %s\n", ecb.Msg);
else
fprintf(stderr, "Error: %d\n", res);
cleanup:
// Free allocated resources
if (buffer) free(buffer);
if (filename) free(filename);
if (docHandle) IGR_Close_File(docHandle, &ecb);
return 0;
}
// Convert a UTF8 string to UCS2
// Convert a UTF8 string to UCS2.
// Returns a newly allocated buffer that the caller must free(), or NULL if
// the allocation fails.
IGR_UCS2* UCS2(const char* src) {
    size_t len = strlen(src);
    // Two bytes per input byte plus two spare bytes for a terminator.
    size_t destSize = len * 2 + 2;

    // Zero-fill the buffer so the spare bytes act as a terminator even if
    // the conversion routine does not write one itself.
    IGR_UCS2* res = calloc(1, destSize);
    if (!res)
        return NULL;

    // Perform the conversion
    UTF8_to_Widechar_Ex(src, len, res, destSize);
    return res;
}
// Convert a UCS2 string to UTF8
// Convert a UCS2 string of charCount characters to UTF8.
// Returns a newly allocated, NUL-terminated buffer that the caller must
// free(), or NULL if the allocation fails.
char* UTF8(const IGR_UCS2* src, size_t charCount)
{
    // Capacity handed to the conversion routine (unchanged: 7 bytes per character).
    size_t destSize = charCount * 7;

    // Allocate one extra zeroed byte beyond that capacity. This keeps the
    // result terminated for printf("%s") and avoids a zero-size allocation,
    // which may return NULL, when charCount is 0.
    char* res = calloc(1, destSize + 1);
    if (!res)
        return NULL;

    // Perform the conversion
    Widechar_to_UTF8_Ex(src, charCount, res, destSize);
    return res;
}
|