The cjknormalization.c program builds upon basic.c and shows how to normalize Chinese, Japanese, and Korean (CJK) input text before processing it in an Eduction session.
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <malloc.h>
#include <edk.h>
#include <string.h>
#ifdef _WIN32
#define stat _stat
#define off_t _off_t
#endif // _WIN32
#define BUFLEN 5120
// Helper function
// Prints the SDK version banner (as reported by EdkGetVersion) followed by
// the command-line usage text. The change-set number is appended to the
// version string only when it is non-zero.
void displayusageinfo() {
    // Fixed usage text, emitted verbatim and in order after the version banner.
    static const char * const usageLines[] = {
        "INFO: Usage: cjknormalization.exe <grammarpath> <documentpath> <licensepath>\n",
        "INFO: Parameters:\n",
        "INFO: <grammarpath> Path to the grammar file to be used.\n",
        "INFO: <documentpath> Path to the document to be parsed.\n",
        "INFO: <licensepath> Path to the license file to be used.\n"
    };
    EDK_VERSION_INFO sdk;
    size_t i;

    EdkGetVersion(&sdk);

    if (!sdk.vChangeSet) {
        printf("INFO: Eduction SDK Sample for SDK version v%s\n", sdk.versionString);
    } else {
        printf("INFO: Eduction SDK Sample for SDK version v%s.%i\n", sdk.versionString, sdk.vChangeSet);
    }
    printf("INFO: SDK Built: %s\n", sdk.buildTime);
    printf("INFO: Copyright %s\n", sdk.copyright);

    for (i = 0; i < sizeof usageLines / sizeof usageLines[0]; ++i) {
        fputs(usageLines[i], stdout);
    }
}
// Helper function
// Reports whether stat() succeeds on the given path.
// Returns 1 when stat() succeeds on szFileName, and 0 when it fails or
// when szFileName is NULL.
int fileExists(const char * const szFileName) {
    struct stat info;

    if (szFileName == NULL) {
        return 0;
    }
    return (stat(szFileName, &info) != 0) ? 0 : 1;
}
// Helper function
// Validates the command line: exactly three user arguments must follow the
// program name (argc == 4), and each of them must name an existing path.
// Echoes the three paths, then checks them in order (grammar, document,
// license) and stops at the first one that does not exist.
// Returns 1 when everything is valid, 0 otherwise (after printing a FAIL
// message; the usage text is printed only for a wrong argument count).
int checkargs(const int argc, char **argv) {
    // Failure message for argv[1], argv[2] and argv[3] respectively.
    static const char * const missingPath[] = {
        "FAIL: Grammar path does not exist.\n",
        "FAIL: Document path does not exist.\n",
        "FAIL: License path does not exist.\n"
    };
    int idx;

    if (argc != 4) {
        printf("FAIL: Program requires four arguments.\n");
        displayusageinfo();
        return 0;
    }

    printf("INFO: Grammar Path: %s\n", argv[1]);
    printf("INFO: Document Path: %s\n", argv[2]);
    printf("INFO: License Path: %s\n", argv[3]);

    for (idx = 1; idx <= 3; ++idx) {
        if (fileExists(argv[idx]) == 0) {
            fputs(missingPath[idx - 1], stdout);
            return 0;
        }
    }
    return 1;
}
// Helper function
// Reads an entire file into a newly allocated, NUL-terminated buffer.
//
// fn: path of the file to read; it is opened in binary mode.
//
// Returns a malloc'ed buffer holding the file's bytes followed by a '\0'
// terminator. The caller owns the buffer and must release it with free().
// Returns NULL, after printing a "FAIL:" message to stdout, when:
//   - stat() fails on fn,
//   - the reported size is zero, negative, or does not fit in a size_t
//     (leaving room for the terminator),
//   - the file cannot be opened,
//   - the buffer cannot be allocated, or
//   - fewer bytes than the reported size could be read.
char *readFile(const char * const fn) {
    struct stat fnInfo;
    off_t len;
    size_t size;
    FILE *f;
    char *buf;
    size_t itemsRead;

    if (stat(fn, &fnInfo)) {
        printf("FAIL: Unable to get file size for \"%s\".\n", fn);
        return NULL;
    }
    len = fnInfo.st_size;
    if (!len) {
        printf("FAIL: Zero byte file size for \"%s\".\n", fn);
        return NULL;
    }

    // off_t is signed and may be wider than size_t: make sure the size
    // survives the conversion and that size + 1 does not wrap to zero.
    size = (size_t)len;
    if (len < 0 || (off_t)size != len || size + 1 == 0) {
        printf("FAIL: Unsupported file size for \"%s\".\n", fn);
        return NULL;
    }

    f = fopen(fn, "rb");
    if (!f) {
        printf("FAIL: Unable to open file \"%s\".\n", fn);
        return NULL;
    }

    buf = malloc(size + 1);
    if (!buf) {
        // Without this check fread() and the terminator write below would
        // go through a NULL pointer.
        fclose(f);
        printf("FAIL: Unable to allocate memory to read \"%s\".\n", fn);
        return NULL;
    }

    itemsRead = fread(buf, 1, size, f);
    fclose(f);
    if (itemsRead < size) {
        free(buf);
        printf("FAIL: Unable to read \"%s\".\n", fn);
        return NULL;
    }

    buf[size] = '\0';
    return buf;
}
// Main function
int main(int argc, char ** argv)
{
int32_t nErrCode;
EdkEngineHandle pEngine;
char *license;
const char* szErrorMsg;
EdkSessionHandle pSession;
char* buf = NULL;
off_t fileSize;
const char *szEntityName, *szEntityText, *szOrigText;
size_t textSize, textLength, origSize, origLength, origOffset, offsetLength;
double score;
const char* szCJKNormalizedText = NULL;
const char* szCJKNormalizationOptions = "HWNum,HWAlpha";
printf("INFO: Program loaded.\n");
if (!checkargs(argc, argv))
return -1;
printf("INFO: Parameters valid.\n");
if (( nErrCode = EdkEngineCreate( &pEngine )) != EdkSuccess) {
printf("Unable to create the EDK Engine. Error code: %d\n" , nErrCode);
return -1;
}
printf("INFO: Engine created.\n");
license = readFile(argv[3]);
if (!license) {
EdkEngineDestroy( pEngine );
return -1;
}
printf("INFO: License read.\n");
// Set the license to the eduction engine
if ((EdkSetLicenseKey( pEngine, license) != EdkSuccess)) {
nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
free(license);
EdkEngineDestroy(pEngine);
return -1;
}
free(license);
printf("INFO: License validated.\n");
// Load resource file
// Call this function repeatedly until all required resource files are loaded
if ((EdkLoadResourceFile(pEngine, argv[1]) != EdkSuccess)) {
nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
EdkEngineDestroy(pEngine);
return -1;
}
printf("INFO: Resource file loaded.\n");
// Add an entity to match to against
// Call this function repeatedly to add all desired entities
// The entities to be added must be defined in the resource files added above
if ((EdkAddTargetEntity(pEngine, "e/e") != EdkSuccess)) {
nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
printf("FAIL: %s (%d)\n", szErrorMsg, nErrCode);
EdkEngineDestroy(pEngine);
return -1;
}
printf("INFO: Grammar(s) initialized.\n");
// Create an eduction session assosicated with this eduction engine
// Multiple sessions can be created and concurrent processing in multithreaded applications
if (EdkSessionCreate(pEngine, &pSession) != EdkSuccess) {
nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
EdkEngineDestroy(pEngine);
return -1;
}
printf("INFO: Session created.\n");
buf = readFile(argv[2]);
if (!buf) {
EdkSessionDestroy(pSession);
EdkEngineDestroy(pEngine);
return -1;
}
printf("INFO: Data file opened and %d byte block reads initiated.\n", BUFLEN);
if (EdkCJKNormalizeText(pEngine, buf, &szCJKNormalizedText, szCJKNormalizationOptions) != EdkSuccess) {
nErrCode = EdkGetLastEngineError(pEngine, &szErrorMsg);
printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
EdkSessionDestroy(pSession);
EdkEngineDestroy(pEngine);
return -1;
}
printf("INFO: Input buffer normalized.\n");
// Add input data
// EdkAddInputText is called repeatedly for as many times as needed until all the input has been exhausted
// The input data must be UTF-8 encoded.
// Note: An alternative method of adding input data is to create a data input stream
printf("INFO: Adding data block to engine.\n");
if ((EdkAddInputText( pSession, szCJKNormalizedText, strlen(szCJKNormalizedText), true)) != EdkSuccess) {
nErrCode = EdkGetLastSessionError(pSession, &szErrorMsg);
printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
EdkSessionDestroy(pSession);
EdkEngineDestroy(pEngine);
return -1;
}
printf("INFO: Data block added.\n");
// Get a match
// This is called repeatedly to get all matches
while (EdkGetNextMatch(pSession) == EdkSuccess) {
// While we have a match, obtain all required information about the match
EdkGetMatchEntityName(pSession, &szEntityName);
EdkGetMatchOrigOffset(pSession, &origOffset);
EdkGetMatchOrigOffsetLength(pSession, &offsetLength);
EdkGetMatchScore(pSession, &score);
EdkGetMatchTextSize(pSession, &textSize);
EdkGetMatchTextLength(pSession, &textLength);
EdkGetMatchOrigSize(pSession, &origSize);
EdkGetMatchOrigLength(pSession, &origLength);
EdkGetMatchOrigText(pSession, &szOrigText);
EdkGetMatchText(pSession, &szEntityText);
printf("INFO: EntityName=\"%s\" Offset=\"%u\" OffsetLength=\"%u\"\n", szEntityName, origOffset, offsetLength);
printf("INFO: Score=\"%04.2f\" NormalizedTextSize=\"%u\" NormalizedTextLength=\"%u\"\n", score, textSize, textLength);
printf("INFO: OriginalTextSize=\"%u\" OriginalTextLength=\"%u\"\n", origSize, origLength);
printf("INFO: Original Text=\"%s\"\n", szOrigText);
printf("INFO: Normalized Text=\"%s\"\n", szEntityText);
}
printf("INFO: Matching on block complete.\n");
nErrCode = EdkGetLastSessionError(pSession, &szErrorMsg);
if ((nErrCode != EdkNoMatch)) {
printf("FAIL: %s (%d)\n" , szErrorMsg, nErrCode);
EdkSessionDestroy(pSession);
EdkEngineDestroy(pEngine);
return -1;
}
// Destroy the session handle and release the resource
EdkSessionDestroy(pSession);
// Ensure that all session handles have been destroyed before calling this
EdkEngineDestroy(pEngine);
printf("PASS: Program completed without an error.\n");
return 0;
}
|
|