123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195 |
- #include <QFile>
- #include <QTextStream>
- #include <QString>
- #include <QStringList>
- #include <QtDebug>
- #include "dictengine.h"
- #include "base64.h"
- DictEngine::DictEngine(QString path, QString name, QObject *parent) :
- QObject(parent), datName(name)
- {
- QString indexFilePath = path + "/" + name + ".index";
- indexFileHandle = new QFile(indexFilePath);
- if(!indexFileHandle->open(QIODevice::ReadOnly | QIODevice::Text))
- throw QString(tr("File %1 could not be opened.").arg(indexFilePath));
- streamIndex = new QTextStream();
- streamIndex->setDevice(indexFileHandle);
- streamIndex->setCodec("UTF-8");
- indexEndPos = indexFileHandle->size() - 1;
- QString dictFilePath = path + "/" + name + ".dict";
- dictFileHandle = new QFile(dictFilePath);
- if(!dictFileHandle->open(QIODevice::ReadOnly | QIODevice::Text))
- throw QString(tr("File %1 could not be opened.").arg(dictFilePath));
- streamDict = new QTextStream();
- streamDict->setDevice(dictFileHandle);
- streamDict->setCodec("UTF-8");
- sugg = new QList<DictRecord>();
- }
- DictEngine::~DictEngine() {
- delete streamIndex;
- delete indexFileHandle;
- delete streamDict;
- delete dictFileHandle;
- delete sugg;
- }
- qint64 DictEngine::findFirstEntry(QString query) const
- {
- qint64 low = 0, high = indexEndPos, pos;
- query = locToASCII(query);
- int length = query.length();
- QStringList sl;
- int cmp;
- /* perform binary search in the .index file */
- do {
- pos = (low + high)/2;
- sl = entryAt(pos);
- cmp = QString::compare(query, locToASCII(sl[0].left(length)), Qt::CaseInsensitive);
- if(low == pos || high == pos) /* no hits */
- return -1;
- if(cmp < 0) { /* a ... query ... sl[0] */
- high = pos;
- }
- else if(cmp > 0) { /* a ... sl[0] ... query */
- low = pos;
- }
- } while(cmp != 0);
- /* go up to find a non-matching entry */
- while(QString::compare(query, locToASCII(sl[0].left(length)), Qt::CaseInsensitive) == 0)
- {
- pos -= max_index_line_length;
- sl = entryAt(pos);
- }
- /* find a matching entry (this ensures it's the first match in the file) */
- QString str;
- while(QString::compare(query, locToASCII(str.left(length)), Qt::CaseInsensitive) != 0)
- {
- pos = streamIndex->pos();
- str = streamIndex->readLine();
- }
- return pos;
- }
- QStringList DictEngine::entryAt(qint64 pos) const /* returns next whole line after closest newline */
- {
- streamIndex->seek(pos);
- streamIndex->readLine();
- return streamIndex->readLine().split("\t");
- }
- QStringList DictEngine::suggestions(QString query)
- {
- // qDebug() << "sugg: " + __LINE__;
- sugg->clear();
- qint64 pos = findFirstEntry(query);
- if(pos == -1) {
- return QStringList();
- }
- streamIndex->seek(pos);
- //DictRecord entry_old, entry;
- QStringList entry_old(QString("")), entry;
- while(sugg->length() < number_of_suggestions) { /* make list of suggestions */
- entry = streamIndex->readLine().split("\t");
- if(entry_old[0] != entry[0]) /* do not append if the previous heading is the same */
- sugg->append(DictRecord(entry[0], entry[1], entry[2]));
- else /* add position and length of the new occurence of the word */
- (*sugg)[(sugg->length() - 1)].occ.append(DictOccurrence(entry[1], entry[2]));
- entry_old = entry;
- }
- QStringList retVal;
- foreach(DictRecord dr, *sugg)
- retVal.append(dr.heading);
- // qDebug() << "sugg: " + __LINE__;
- return retVal;
- }
- QString DictEngine::meaning(int index)
- {
- if(sugg->length() == 0)
- return QString(tr("No hits"));
- int headingLen = sugg->at(index).heading.toUtf8().length() + 1; /* number of bytes in heading plus the newline character */
- QString str, item;
- // item.reserve(1000);
- str = "<h3>" + sugg->at(index).heading + "</h3>";
- int len;
- const QList<DictOccurrence> &doL = sugg->at(index).occ;
- for(int i = 0; i < doL.length(); i++) {
- // foreach(DictOccurrence do, doL) {
- str += QString("<p><b>%1: </b>").arg(i + 1);
- streamDict->seek(base64toInt(doL[i].pos) + headingLen);
- len = base64toInt(doL[i].length) - headingLen;
- item = streamDict->read(len);
- item.truncate(2*len - item.toUtf8().length()); /* get rid of unintentionally read extra characters from the next entry */
- str += item + "</p>";
- }
- return str;
- }
- QString DictEngine::name() const
- {
- QString line;
- streamDict->seek(0);
- int i = 0; /* search for 00-database-short, if it's not found on */
- /* first lines, return file name without extension. */
- while((line != "00-database-short") && (i++ < 20)) {
- line = streamDict->readLine();
- }
- if(i == 20)
- return datName;
- return streamDict->readLine().trimmed();
- }
- QString DictEngine::databaseName() const
- {
- return datName;
- }
- qint64 DictEngine::base64toInt(QString str) const
- {
- int c;
- qint64 val = 0;
- for(int i = 0; (i < str.length()) && (i < sizeof(qint64)); i++) {
- c = static_cast<int>(str.at(str.length() - i - 1).toAscii());
- if((c & 0x80) || (base64_val[c] == -1))
- throw QString("Non-Base64 character was encountered.");
- val |= base64_val[c] << (i * 6);
- }
- if(val >= dictFileHandle->size())
- throw QString("Attempt to access dict file beyond its end.");
- return val;
- }
- QString DictEngine::locToASCII(QString input) const
- {
- return input;
- QString sharp_s = QString::fromLocal8Bit("ß");
- QString norm = input.normalized(QString::NormalizationForm_KD);
- QString output;
- output.reserve(input.length()*2); /* the highest possible needed size is <input length>*2 */
- foreach(const QChar& c, norm) {
- if(c < 0x80)
- output.append(c);
- else if(c == sharp_s.at(0))
- output.append('s');
- }
- return output;
- }
|