The postprocess.c program builds upon basic.c and shows the workflow needed to support post-processing in Eduction. The program:
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <malloc.h>
#include <edk.h>
#include <string.h>
#ifdef _WIN32
#define stat _stat
#define off_t _off_t
#endif // _WIN32
#define BUFLEN 5120
// Helper function
//
// Prints the SDK version details reported by EdkGetVersion, followed by the
// command-line usage text for this sample. Everything is written to stdout.
void displayusageinfo(void) {
    EDK_VERSION_INFO versionInfo;

    EdkGetVersion(&versionInfo);

    // The change set number is only appended to the version when it is non-zero.
    if (versionInfo.vChangeSet) {
        printf("INFO: Eduction SDK Sample for SDK version v%s.%i\n", versionInfo.versionString, versionInfo.vChangeSet);
    } else {
        printf("INFO: Eduction SDK Sample for SDK version v%s\n", versionInfo.versionString);
    }
    printf("INFO: SDK Built: %s\n", versionInfo.buildTime);
    printf("INFO: Copyright %s\n", versionInfo.copyright);

    // The placeholder names in the usage line match the parameter list below.
    printf("INFO: Usage: sample1 <grammarpath> <entities> <documentpath> <licensepath>\n");
    printf("INFO: Parameters:\n");
    printf("INFO: <grammarpath> Path to the grammar file that defines matchable\n");
    printf("INFO: entities. The grammar file can be in uncompiled (XML)\n");
    printf("INFO: or compiled (ECR) format.\n");
    printf("INFO: <entities> Comma delimited list of entities in the grammar file\n");
    printf("INFO: to be used for matching.\n");
    printf("INFO: <documentpath> Path to the document to be parsed.\n");
    printf("INFO: <licensepath> Path to the license file to be used.\n");
}
// Helper function
//
// Reports whether stat() succeeds for the given path.
//   szFileName  Path to probe; may be NULL.
// Returns 1 when stat() succeeds, 0 when it fails or szFileName is NULL.
int fileExists(const char * const szFileName) {
    struct stat info;

    if (szFileName == NULL) {
        return 0;
    }
    return stat(szFileName, &info) == 0 ? 1 : 0;
}
// Helper function
//
// Validates the command line: exactly four user arguments must be present,
// and the grammar, document and license paths must exist on disk. The
// received arguments are echoed to stdout before the paths are checked.
//   argc  Argument count as passed to main.
//   argv  Argument vector as passed to main.
// Returns 1 when the arguments are usable, 0 otherwise (after printing the
// reason, and the usage text when the argument count is wrong).
int checkargs(const int argc, char **argv) {
    // argv positions that must name an existing file, checked in this order.
    static const struct {
        int position;
        const char *failure;
    } requiredPaths[] = {
        { 1, "FAIL: Grammar path does not exist.\n" },
        { 3, "FAIL: Document path does not exist.\n" },
        { 4, "FAIL: License path does not exist.\n" },
    };
    size_t i;

    if (argc != 5) {
        printf("FAIL: Program requires four arguments.\n");
        displayusageinfo();
        return 0;
    }

    printf("INFO: Grammar Path: %s\n", argv[1]);
    printf("INFO: Entities: %s\n", argv[2]);
    printf("INFO: Document Path: %s\n", argv[3]);
    printf("INFO: License Path: %s\n", argv[4]);

    for (i = 0; i < sizeof requiredPaths / sizeof requiredPaths[0]; ++i) {
        if (!fileExists(argv[requiredPaths[i].position])) {
            printf("%s", requiredPaths[i].failure);
            return 0;
        }
    }
    return 1;
}
// Helper function
//
// Reads an entire file into a newly allocated, NUL-terminated buffer.
//   fn  Path of the file to read.
// Returns the buffer on success; the caller owns it and must free() it.
// Returns NULL (after printing a FAIL message to stdout) when the file size
// cannot be determined, the file is empty, the file cannot be opened, memory
// cannot be allocated, or fewer bytes than the reported size could be read.
char *readFile(const char * const fn) {
    struct stat fnInfo;
    off_t len;
    FILE *f;
    char *buf;
    size_t itemsRead;

    if (stat(fn, &fnInfo)) {
        printf("FAIL: Unable to get file size for \"%s\".\n", fn);
        return NULL;
    }
    len = fnInfo.st_size;
    if (!len) {
        printf("FAIL: Zero byte file size for \"%s\".\n", fn);
        return NULL;
    }
    f = fopen(fn, "rb");
    if (!f) {
        printf("FAIL: Unable to open file \"%s\".\n", fn);
        return NULL;
    }

    // One extra byte for the terminating NUL.
    buf = malloc((size_t)len + 1);
    if (!buf) {
        // Without this check fread() and the terminator store below would
        // write through a NULL pointer.
        fclose(f);
        printf("FAIL: Unable to allocate memory for \"%s\".\n", fn);
        return NULL;
    }

    itemsRead = fread(buf, 1, (size_t)len, f);
    fclose(f);
    if (itemsRead < (size_t)len) {
        free(buf);
        printf("FAIL: Unable to read \"%s\".\n", fn);
        return NULL;
    }
    buf[len] = '\0';
    return buf;
}
// Helper function
//
// Opens a file for chunked reading and reads its first block.
//   fn         Path of the file to open.
//   f          Receives the open stream on success. Set to NULL when the
//              open fails or the first read comes up short; left untouched
//              when the size lookup fails or the file is empty.
//   buf        Destination for the first block.
//   bufLen     Capacity of buf in bytes.
//   fileSize   Receives the file size reported by stat().
//   bytesRead  Receives the number of bytes placed in buf (success only).
// Returns 1 on success, 0 on failure (after printing a FAIL message).
int readFirst(const char * const fn, FILE **f, char * const buf, const size_t bufLen, off_t * const fileSize, size_t * const bytesRead) {
    struct stat info;
    size_t wanted;
    size_t got;

    if (stat(fn, &info) != 0) {
        printf("FAIL: Unable to get file size for \"%s\".\n", fn);
        return 0;
    }

    *fileSize = info.st_size;
    if (*fileSize == 0) {
        printf("FAIL: Zero byte file size for \"%s\".\n", fn);
        return 0;
    }

    *f = fopen(fn, "rb");
    if (*f == NULL) {
        printf("FAIL: Unable to open file \"%s\".\n", fn);
        return 0;
    }

    // Read a full buffer, or the whole file when it is smaller than the buffer.
    if ((off_t)bufLen < *fileSize) {
        wanted = bufLen;
    } else {
        wanted = (size_t)*fileSize;
    }

    got = fread(buf, 1, wanted, *f);
    if (got < wanted) {
        fclose(*f);
        *f = NULL;
        printf("FAIL: Unable to read \"%s\".\n", fn);
        return 0;
    }

    *bytesRead = got;
    return 1;
}
// Helper function
//
// Reads the next block from a stream that is already open.
//   fn              File name, used only in the failure message.
//   f               Stream to read from.
//   buf             Destination for the block.
//   bufLen          Capacity of buf in bytes.
//   bytesRemaining  Number of bytes of the file not yet read.
//   bytesRead       Receives the number of bytes placed in buf; set to 0
//                   when nothing is left to read, untouched on failure.
// Returns 1 on success (including the nothing-left case), 0 when fewer
// bytes than requested could be read (after printing a FAIL message).
int readNext(const char * const fn, FILE *f, char * const buf, const size_t bufLen, const size_t bytesRemaining, size_t * const bytesRead) {
    size_t wanted = bytesRemaining;
    size_t got;

    // Never request more than the buffer can hold.
    if (bufLen < bytesRemaining) {
        wanted = bufLen;
    }

    if (wanted == 0) {
        *bytesRead = 0;
        return 1;
    }

    got = fread(buf, 1, wanted, f);
    if (got < wanted) {
        printf("FAIL: Unable to continue reading \"%s\".\n", fn);
        return 0;
    }

    *bytesRead = got;
    return 1;
}
// Main function
//
// Runs the post-processing sample end to end:
//   1. validates the command line (see checkargs),
//   2. creates an engine, applies the license, loads the grammar and selects
//      the target entity,
//   3. builds a post-processing task, task collection and post-processor,
//   4. feeds the document to a session in BUFLEN-sized blocks,
//   5. collects the matches, runs the post-processor over them and prints
//      each match and its components together with PASS/FAIL checks.
//
//   argv[1]  grammar path      argv[2]  entity
//   argv[3]  document path     argv[4]  license path
//
// Returns 0 on success and -1 on any failure. Once the engine exists, every
// exit goes through the single cleanup section at the bottom so that no SDK
// object or file handle is leaked on an error path.
int main(int argc, char ** argv)
{
    int exitCode = -1;      // becomes 0 only when the whole run succeeds
    int haveSession = 0;    // set once pSession holds a live session
    int32_t nErrCode;
    EdkEngineHandle pEngine;
    char *license;
    const char* szErrorMsg;
    EdkSessionHandle pSession;
    size_t bytesRead, bytesRemaining;
    FILE *f = NULL;
    char buf[BUFLEN], componentText[128];
    off_t fileSize;
    const char *szEntityName, *szEntityText, *szOrigText;
    size_t textSize, textLength, origSize, origLength, origOffset, offsetLength;
    double score;
    size_t nComponents, nComponent;
    EdkPostProcessTaskHandle pTask = NULL;
    EdkPostProcessTasksCollectionHandle pTaskSet = NULL;
    EdkPostProcessorHandle pProcessor = NULL;
    EdkMatchesCollectionHandle pMatchSet = NULL;
    EdkMatchHandle pMatch = NULL;
    size_t nMatches = 0;
    size_t nIndex = 0;

    printf("INFO: Program loaded.\n");
    if (!checkargs(argc, argv))
        return -1;
    printf("INFO: Parameters valid.\n");

    if ((nErrCode = EdkEngineCreate(&pEngine)) != EdkSuccess) {
        printf("Unable to create the EDK Engine. Error code: %d\n", nErrCode);
        return -1;
    }
    printf("INFO: Engine created.\n");

    license = readFile(argv[4]);
    if (!license)
        goto cleanup;
    printf("INFO: License read.\n");

    // Set the license to the eduction engine
    if (EdkSetLicenseKey(pEngine, license) != EdkSuccess) {
        nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
        printf("FAIL: %s (%d)\n", szErrorMsg, nErrCode);
        free(license);
        goto cleanup;
    }
    free(license);
    printf("INFO: License validated.\n");

    // Configure the eduction engine
    // Settings include:
    // EnableComponents
    // EnableUniqueMatches
    // MaxMatchLength
    // MaxMatchesPerDoc
    // MatchWholeWord
    // TokenWithPunctuation
    // AllowOverlaps
    // AllowMultipleResults
    // MatchCases
    // Locale
    EdkSetEnableComponents(pEngine, true);

    // Load resource file
    // Call this function repeatedly until all required resource files are loaded
    if (EdkLoadResourceFile(pEngine, argv[1]) != EdkSuccess) {
        nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
        printf("FAIL: %s (%d)\n", szErrorMsg, nErrCode);
        goto cleanup;
    }
    printf("INFO: Resource file loaded.\n");

    // Add an entity to match to against
    // Call this function repeatedly to add all desired entities
    // The entities to be added must be defined in the resource files added above
    if (EdkAddTargetEntity(pEngine, argv[2]) != EdkSuccess) {
        nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
        printf("FAIL: %s (%d)\n", szErrorMsg, nErrCode);
        goto cleanup;
    }
    printf("INFO: Grammar(s) initialized.\n");

    /* Set up post processing stuff */
    // After a failed create the handle is reset to NULL so that the cleanup
    // section never destroys an object that was not created.
    if ((nErrCode = EdkPostProcessorTaskCreate("test_task", "scripts/turing.lua", argv[2], false, &pTask)) != EdkSuccess) {
        printf("Unable to create post processing task. Error code: %d\n", nErrCode);
        pTask = NULL;
        goto cleanup;
    }
    printf("INFO: Post process task test_task, with script scripts/turing.lua, created.\n");

    if ((nErrCode = EdkPostProcessorTasksCollectionCreate(&pTaskSet)) != EdkSuccess) {
        printf("Unable to create post processing task collection. Error code: %d\n", nErrCode);
        pTaskSet = NULL;
        goto cleanup;
    }
    if ((nErrCode = EdkPostProcessorTasksCollectionAddTask(pTaskSet, pTask)) != EdkSuccess) {
        printf("Unable to add post processing task to collection. Error code: %d\n", nErrCode);
        goto cleanup;
    }
    printf("INFO: Post process task list created.\n");

    if ((nErrCode = EdkPostProcessorCreate(pTaskSet, &pProcessor)) != EdkSuccess) {
        printf("Unable to create post processor. Error code: %d\n", nErrCode);
        pProcessor = NULL;
        goto cleanup;
    }
    printf("INFO: Post processor object initialized.\n");

    // Create an eduction session associated with this eduction engine
    // Multiple sessions can be created and concurrent processing in multithreaded applications
    if (EdkSessionCreate(pEngine, &pSession) != EdkSuccess) {
        nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
        printf("FAIL: %s (%d)\n", szErrorMsg, nErrCode);
        goto cleanup;
    }
    haveSession = 1;
    printf("INFO: Session created.\n");

    if (!readFirst(argv[3], &f, buf, BUFLEN, &fileSize, &bytesRead)) {
        f = NULL;   // no stream is left open when the first read fails
        goto cleanup;
    }
    bytesRemaining = (size_t)fileSize - bytesRead;
    printf("INFO: Data file opened and %d byte block reads initiated.\n", BUFLEN);

    // Add input data
    // EdkAddInputText is called repeatedly for as many times as needed until all the input has been exhausted
    // The input data must be UTF-8 encoded.
    // Note: An alternative method of adding input data is to create a data input stream
    while (bytesRead) {
        printf("INFO: Adding data block to engine.\n");
        // The last argument flags the final block: true once nothing remains.
        if (EdkAddInputText(pSession, buf, bytesRead, bytesRemaining ? false : true) != EdkSuccess) {
            nErrCode = EdkGetLastSessionError(pSession, &szErrorMsg);
            printf("FAIL: %s (%d)\n", szErrorMsg, nErrCode);
            goto cleanup;
        }
        printf("INFO: Data block added.\n");
        if (!readNext(argv[3], f, buf, BUFLEN, bytesRemaining, &bytesRead))
            goto cleanup;
        bytesRemaining -= bytesRead;
    }
    fclose(f);
    f = NULL;

    /* Get all matches found from the input and process them */
    if ((nErrCode = EdkFillMatches(pSession, &pMatchSet)) != EdkSuccess) {
        printf("Unable to retrieve matches from current session. Error code: %d\n", nErrCode);
        pMatchSet = NULL;
        goto cleanup;
    }

    printf("INFO: Running post-processor on match set.\n");
    // NOTE(review): the results of the next two calls are ignored, as in the
    // original sample; confirm whether they report errors and check them if so.
    EdkPostProcessorRun(pProcessor, pMatchSet);
    EdkGetNumMatches(pMatchSet, &nMatches);
    printf("INFO: Post processing complete.\n");
    printf("%sGot expected number of matches.\n", nMatches == 1 ? "PASS: " : "FAIL: ");

    for (nIndex = 0; nIndex < nMatches; nIndex++) {
        // Give every output a defined value in case an accessor fails and
        // leaves its output untouched.
        szEntityName = szEntityText = szOrigText = "";
        textSize = textLength = origSize = origLength = origOffset = offsetLength = 0;
        score = 0.0;
        nComponents = 0;

        EdkRetrieveMatch(pMatchSet, nIndex, &pMatch);
        /* print out match info using match info accessors */
        EdkMatchGetEntityName(pMatch, &szEntityName);
        EdkMatchGetMatchedTextOffset(pMatch, &origOffset);
        EdkMatchGetMatchedTextOffsetLength(pMatch, &offsetLength);
        EdkMatchGetScore(pMatch, &score);
        EdkMatchGetNormalizedTextSize(pMatch, &textSize);
        EdkMatchGetNormalizedTextLength(pMatch, &textLength);
        EdkMatchGetMatchedTextSize(pMatch, &origSize);
        EdkMatchGetMatchedTextLength(pMatch, &origLength);
        EdkMatchGetMatchedText(pMatch, &szOrigText);
        EdkMatchGetNormalizedText(pMatch, &szEntityText);

        // size_t values are printed with %zu; %u is undefined behaviour
        // wherever size_t is wider than unsigned int.
        printf("INFO: EntityName=\"%s\" Offset=\"%zu\" OffsetLength=\"%zu\"\n", szEntityName, origOffset, offsetLength);
        printf("INFO: Score=\"%04.2f\" NormalizedTextSize=\"%zu\" NormalizedTextLength=\"%zu\"\n", score, textSize, textLength);
        printf("INFO: OriginalTextSize=\"%zu\" OriginalTextLength=\"%zu\"\n", origSize, origLength);
        printf("INFO: Original Text=\"%s\"\n", szOrigText);
        printf("INFO: Normalized Text=\"%s\"\n", szEntityText);

        EdkMatchGetComponentCount(pMatch, &nComponents);
        for (nComponent = 0; nComponent < nComponents; ++nComponent) {
            EdkMatchComponentHandle pComponent = NULL;
            size_t copyLen;

            szEntityName = "";
            origOffset = offsetLength = origSize = origLength = 0;

            EdkMatchGetComponentHandle(pMatch, nComponent, &pComponent);
            EdkMatchComponentGetName(pComponent, &szEntityName);
            EdkMatchComponentGetMatchedTextOffset(pComponent, &origOffset);
            EdkMatchComponentGetMatchedTextLength(pComponent, &offsetLength);
            EdkMatchComponentGetSize(pComponent, &origSize);
            EdkMatchComponentGetLength(pMatch, pComponent, &origLength);

            // Clamp the copy so that a component larger than componentText
            // is truncated instead of overflowing the buffer.
            copyLen = origSize < sizeof componentText - 1 ? origSize : sizeof componentText - 1;
            strncpy(componentText, szEntityText + origOffset, copyLen);
            componentText[copyLen] = '\0';

            printf("INFO: Component Name=\"%s\" Text=\"%s\"\n", szEntityName, componentText);
            printf("INFO: Offset=\"%zu\" OffsetLength=\"%zu\" TextSize=\"%zu\" TextLength=\"%zu\"\n", origOffset, offsetLength, origSize, origLength);
        }

        printf("INFO: Post processing complete.\n");
        printf("%sGot expected score for match.\n", score == 5.00 ? "PASS: " : "FAIL: ");
        printf("%sGot expected text for match.\n", !strcmp("Alan Turing", szOrigText) ? "PASS: " : "FAIL: ");
    }

    exitCode = 0;

cleanup:
    // Shared exit path for success and failure. Objects are destroyed in the
    // same order the original sample used, and only if they were created.
    if (f)
        fclose(f);
    // Destroy the post-processing things, plus the match set
    if (pTask)
        EdkPostProcessorTaskDestroy(pTask);
    if (pTaskSet)
        EdkPostProcessorTasksCollectionDestroy(pTaskSet);
    if (pProcessor)
        EdkPostProcessorDestroy(pProcessor);
    if (pMatchSet)
        EdkDestroyMatches(pMatchSet);
    // Destroy the session handle and release the resource
    if (haveSession)
        EdkSessionDestroy(pSession);
    // Ensure that all session handles have been destroyed before calling this
    EdkEngineDestroy(pEngine);

    if (exitCode == 0)
        printf("PASS: Program completed without an error.\n");
    return exitCode;
}
|
|