#include "Words.h"
#include <ctype.h>
#include <stdio.h>
#include <string.h>
enum {
FIND_WORD,
GET_WORD,
GET_FLAGS
};
#define MAXMETAPH 6
static char vsvfn[26] = { 1, 16, 4, 16, 9, 2, 4, 16, 9, 2, 0, 2, 2, 2, 1, 4, 0,
2, 4, 4, 1, 0, 0, 0, 8, 0};
static const char* gCmpKey;
static int
word_cmp(BString** firstArg, BString** secondArg)
{
return word_match(gCmpKey, (*firstArg)->String()) - word_match(gCmpKey,
(*secondArg)->String());
}
Words::Words(bool useMetaphone)
:
fUseMetaphone(useMetaphone)
{
}
Words::Words(BPositionIO* thes, bool useMetaphone)
:
WIndex(thes),
fUseMetaphone(useMetaphone)
{
}
Words::~Words(void)
{
}
Words::Words(const char* dataPath, const char* indexPath, bool useMetaphone)
:
fUseMetaphone(useMetaphone)
{
if (!useMetaphone)
fEntrySize = sizeof(uint32);
SetTo(dataPath, indexPath);
}
status_t
Words::BuildIndex(void)
{
char buffer[16384];
char *nptr, *eptr;
int64 blockOffset;
int32 blockSize;
WIndexEntry entry;
char entryName[256], *namePtr = entryName;
char suffixName[256];
char flags[32], *flagsPtr = flags;
int32 state = FIND_WORD;
fDataFile->Seek(0, SEEK_SET);
entry.offset = -1;
while (true) {
blockOffset = fDataFile->Position();
if ((blockSize = fDataFile->Read(buffer, 16384)) == 0)
break;
for (nptr = buffer, eptr = buffer + blockSize; nptr < eptr; nptr++) {
if (state == FIND_WORD) {
if (isalpha(*nptr)) {
state = GET_WORD;
*namePtr++ = *nptr;
entry.offset = blockOffset + (nptr - buffer);
} else {
entry.offset++;
}
} else if ((*nptr == '\n') || (*nptr == '\r')) {
if (namePtr != entryName) {
*namePtr = 0;
*flagsPtr = 0;
NormalizeWord(entryName, entryName);
entry.key = GetKey(entryName);
AddItem(&entry);
if (flagsPtr != flags) {
for (flagsPtr = flags; *flagsPtr != 0; flagsPtr++) {
if (suffix_word(suffixName, entryName, *flagsPtr)) {
entry.key = GetKey(suffixName);
AddItem(&entry);
}
}
}
}
state = FIND_WORD;
namePtr = entryName;
flagsPtr = flags;
} else if (state == GET_WORD) {
if (*nptr == '/') {
*namePtr = 0;
state = GET_FLAGS;
} else {
*namePtr++ = *nptr;
}
} else if (state == GET_FLAGS)
*flagsPtr++ = *nptr;
}
}
SortItems();
return B_OK;
}
int32
Words::GetKey(const char* s)
{
if (fUseMetaphone) {
char Metaph[12];
const char *sPtr;
int32 key = 0;
int32 offset;
char c;
metaphone(s, Metaph, GENERATE);
for (sPtr = Metaph, offset = 25; *sPtr; sPtr++, offset -= 5) {
c = *sPtr - 'A';
key |= int32(c) << offset;
}
for (; offset >= 0; offset -= 5)
key |= int32(31) << offset;
return key;
} else {
return WIndex::GetKey(s);
}
}
#define vowel(x) (vsvfn[(x) - 'A'] & 1)
#define same(x) (vsvfn[(x) - 'A'] & 2)
#define varson(x) (vsvfn[(x) - 'A'] & 4)
#define frontv(x) (vsvfn[(x) - 'A'] & 8)
#define noghf(x) (vsvfn[(x) - 'A'] & 16)
#define NUL '\0'
bool
metaphone(const char* Word, char* Metaph, metaphlag Flag)
{
char *n, *n_start, *n_end;
char *metaph = NULL, *metaph_end;
char ntrans[512];
char newm[MAXMETAPH + 4];
int KSflag;
for (n = ntrans + 1, n_end = ntrans + sizeof(ntrans) - 2;
*Word && n < n_end; ++Word) {
if (isalpha(*Word))
*n++ = toupper(*Word);
}
if (n == ntrans + 1)
return false;
else
n_end = n;
*n++ = NUL;
*n = NUL;
n = ntrans;
*n++ = NUL;
if (COMPARE == Flag) {
metaph = Metaph;
Metaph = newm;
}
switch (*n) {
case 'P':
case 'K':
case 'G':
if ('N' == *(n + 1))
*n++ = NUL;
break;
case 'A':
if ('E' == *(n + 1))
*n++ = NUL;
break;
case 'W':
if ('R' == *(n + 1)) {
*n++ = NUL;
} else if ('H' == *(n + 1)) {
*(n + 1) = *n;
*n++ = NUL;
}
break;
case 'X':
*n = 'S';
break;
}
KSflag = false;
for (metaph_end = Metaph + MAXMETAPH, n_start = n;
n <= n_end && Metaph < metaph_end; ++n) {
if (KSflag) {
KSflag = false;
*Metaph++ = *n;
} else {
if (*(n - 1) == *n && *n != 'C')
continue;
if (same(*n) || (n == n_start && vowel(*n))) {
*Metaph++ = *n;
} else {
switch (*n) {
case 'B':
if (n < n_end || *(n - 1) != 'M')
*Metaph++ = *n;
break;
case 'C':
if (*(n - 1) != 'S' || !frontv(*(n + 1))) {
if ('I' == *(n + 1) && 'A' == *(n + 2))
*Metaph++ = 'X';
else if (frontv(*(n + 1)))
*Metaph++ = 'S';
else if ('H' == *(n + 1)) {
*Metaph++ = ((n == n_start && !vowel(*(n + 2)))
|| 'S' == *(n - 1)) ? 'K' : 'X';
} else {
*Metaph++ = 'K';
}
}
break;
case 'D':
*Metaph++ = ('G' == *(n + 1) && frontv(*(n + 2)))
? 'J' : 'T';
break;
case 'G':
if ((*(n + 1) != 'H' || vowel(*(n + 2)))
&& (*(n + 1) != 'N' || ((n + 1) < n_end
&& (*(n + 2) != 'E' || *(n + 3) != 'D')))
&& (*(n - 1) != 'D' || !frontv(*(n + 1)))) {
*Metaph++ = (frontv(*(n + 1))
&& *(n + 2) != 'G') ? 'J' : 'K';
} else if ('H' == *(n + 1) && !noghf(*(n - 3))
&& *(n - 4) != 'H') {
*Metaph++ = 'F';
}
break;
case 'H':
if (!varson(*(n - 1))
&& (!vowel(*(n - 1)) || vowel(*(n + 1)))) {
*Metaph++ = 'H';
}
break;
case 'K':
if (*(n - 1) != 'C')
*Metaph++ = 'K';
break;
case 'P':
*Metaph++ = ('H' == *(n + 1)) ? 'F' : 'P';
break;
case 'Q':
*Metaph++ = 'K';
break;
case 'S':
*Metaph++ = ('H' == *(n + 1) || ('I' == *(n + 1)
&& ('O' == *(n + 2) || 'A' == *(n + 2))))
? 'X' : 'S';
break;
case 'T':
if ('I' == *(n + 1)
&& ('O' == *(n + 2) || 'A' == *(n + 2))) {
*Metaph++ = 'X';
} else if ('H' == *(n + 1)) {
*Metaph++ = 'O';
} else if (*(n + 1) != 'C' || *(n + 2) != 'H') {
*Metaph++ = 'T';
}
break;
case 'V':
*Metaph++ = 'F';
break;
case 'W':
case 'Y':
if (vowel(*(n + 1)))
*Metaph++ = *n;
break;
case 'X':
if (n == n_start) {
*Metaph++ = 'S';
} else {
*Metaph++ = 'K';
KSflag = true;
}
break;
case 'Z':
*Metaph++ = 'S';
break;
}
}
}
if (COMPARE == Flag
&& *(Metaph - 1) != metaph[(Metaph - newm) - 1]) {
return false;
}
}
if (COMPARE == Flag && metaph[Metaph - newm])
return false;
*Metaph = NUL;
return true;
}
int
word_match(const char* reference, const char* test)
{
const char *s1, *s2;
int32 x = 0;
char c1, c2;
s1 = test;
s2 = reference;
bool a, b;
while (*s2 || *s1) {
c1 = tolower(*s1);
c2 = tolower(*s2);
if (*s2 && *s1) {
if (c1 != c2) {
a = (tolower(s1[1]) == c2);
b = (tolower(s2[1]) == c1);
if (a && b) {
x += 1;
s1++;
s2++;
}
if (a) {
x += 1;
s1++;
}
else if (b) {
x += 1;
s2++;
}
else if (vsvfn[(unsigned)c1] == vsvfn[(unsigned)c2])
x++;
else
x += 3;
}
} else {
x += 1;
}
if (*s2)
s2++;
if (*s1)
s1++;
}
return x;
}
int32
suffix_word(char* dst, const char* src, char flag)
{
char* end;
end = stpcpy(dst, src);
flag = toupper(flag);
switch (flag) {
case 'V':
switch (end[-1]) {
case 'e':
end = stpcpy(end - 1, "ive");
break;
default:
end = stpcpy(end, "ive");
break;
}
break;
case 'N':
switch (end[-1]) {
case 'e':
end = stpcpy(end - 1, "ion");
break;
case 'y':
end = stpcpy(end - 1, "ication");
break;
default:
end = stpcpy(end, "en");
break;
}
break;
case 'X':
switch (end[-1]) {
case 'e':
end = stpcpy(end - 1, "ions");
break;
case 'y':
end = stpcpy(end - 1, "ications");
break;
default:
end = stpcpy(end, "ens");
break;
}
break;
case 'H':
switch (end[-1]) {
case 'y':
end = stpcpy(end - 1, "ieth");
break;
default:
end = stpcpy(end, "th");
break;
}
break;
case 'Y':
end = stpcpy(end, "ly");
break;
case 'G':
switch (end[-1]) {
case 'e':
end = stpcpy(end - 1, "ing");
break;
default:
end = stpcpy(end, "ing");
break;
}
break;
case 'J':
switch (end[-1]) {
case 'e':
end = stpcpy(end - 1, "ings");
break;
default:
end = stpcpy(end, "ings");
break;
}
break;
case 'D':
switch (end[-1]) {
case 'e':
end = stpcpy(end - 1, "ed");
break;
case 'y':
if (!strchr("aeiou", end[-2])) {
end = stpcpy(end - 1, "ied");
break;
}
default:
end = stpcpy(end, "ed");
break;
}
break;
case 'T':
switch (end[-1]) {
case 'e':
end = stpcpy(end - 1, "est");
break;
case 'y':
if (!strchr("aeiou", end[-2])) {
end = stpcpy(end - 1, "iest");
break;
}
default:
end = stpcpy(end, "est");
break;
}
break;
case 'R':
switch (end[-1]) {
case 'e':
end = stpcpy(end - 1, "er");
break;
case 'y':
if (!strchr("aeiou", end[-2])) {
end = stpcpy(end - 1, "ier");
break;
}
default:
end = stpcpy(end, "er");
break;
}
break;
case 'Z':
switch (end[-1]) {
case 'e':
end = stpcpy(end - 1, "ers");
break;
case 'y':
if (!strchr("aeiou", end[-2])) {
end = stpcpy(end - 1, "iers");
break;
}
default:
end = stpcpy(end, "ers");
break;
}
break;
case 'S':
switch (end[-1]) {
case 's':
case 'x':
case 'z':
case 'h':
end = stpcpy(end, "es");
break;
case 'y':
if (!strchr("aeiou", end[-2])) {
end = stpcpy(end - 1, "ies");
break;
}
default:
end = stpcpy(end, "s");
break;
}
break;
case 'P':
switch (end[-1]) {
case 'y':
if (!strchr("aeiou", end[-2])) {
end = stpcpy(end - 1, "iness");
break;
}
default:
end = stpcpy(end, "ness");
break;
}
break;
case 'M':
end = stpcpy(end, "'s");
break;
default:
return 0;
}
return end - dst;
}
int32
Words::FindBestMatches(BList* matches, const char* s)
{
int32 index;
if ((index = FindFirst(s)) >= 0) {
BString srcWord(s);
FileEntry* entry;
WIndexEntry* indexEntry;
int32 key = (ItemAt(index))->key;
int32 suffixLength;
char word[128], suffixWord[128];
const char *src, *testWord;
const char *suffixFlags;
char *dst;
gCmpKey = srcWord.String();
uint8 hashTable[32];
uint8 hashValue, highHash, lowHash;
for (int32 i = 0; i < 32; i++)
hashTable[i] = 0;
do {
indexEntry = ItemAt(index);
hashValue = indexEntry->offset % 256;
highHash = hashValue >> 3;
lowHash = 0x01 << (hashValue & 0x07);
if (!(hashTable[highHash] & lowHash)) {
hashTable[highHash] |= lowHash;
entry = GetEntry(index);
src = entry->String();
while (*src && !isalpha(*src))
src++;
dst = word;
while (*src && *src != '/')
*dst++ = *src++;
*dst = 0;
if (*src == '/')
suffixFlags = src + 1;
else
suffixFlags = src;
testWord = word;
do {
if ((GetKey(testWord) == key)
&& word_match(gCmpKey, testWord)
<= int32(float(strlen(gCmpKey)-1)*.75)) {
matches->AddItem(new BString(testWord));
}
if (*suffixFlags) {
suffixLength = 0;
while (*suffixFlags
&& !(suffixLength = suffix_word(suffixWord,
word, *suffixFlags++))) {}
if (suffixLength)
testWord = suffixWord;
else
testWord = NULL;
} else {
testWord = NULL;
}
} while (testWord);
delete entry;
}
index++;
} while (key == (ItemAt(index))->key);
return matches->CountItems();
} else {
return 0;
}
}
void
sort_word_list(BList* matches, const char* reference)
{
if (matches->CountItems() > 0) {
BString srcWord(reference);
gCmpKey = srcWord.String();
matches->SortItems((int(*)(const void*, const void*))word_cmp);
}
}