summaryrefslogtreecommitdiffstats
path: root/src/recode.c
diff options
context:
space:
mode:
authorSuren A. Chilingaryan <csa@dside.dyndns.org>2005-08-02 04:43:01 +0000
committerSuren A. Chilingaryan <csa@dside.dyndns.org>2005-08-02 04:43:01 +0000
commit8b75f9bb6a09d54d634ff661655659951378aa2c (patch)
tree611f800f33ca919d37c724957fcc8c2b2bccb342 /src/recode.c
parente3f702e83a26468ee44f3f342a7a40a252f4603c (diff)
downloadlibrcc-8b75f9bb6a09d54d634ff661655659951378aa2c.tar.gz
librcc-8b75f9bb6a09d54d634ff661655659951378aa2c.tar.bz2
librcc-8b75f9bb6a09d54d634ff661655659951378aa2c.tar.xz
librcc-8b75f9bb6a09d54d634ff661655659951378aa2c.zip
Language autodetection
- rccConfigRecode function's are added - Language autodetection using aspell is added - Translation in 3 modes: "To English Language", "Skip English Translation", "Full" - Example2 to demonstrate translation possibilities
Diffstat (limited to 'src/recode.c')
-rw-r--r--src/recode.c233
1 files changed, 205 insertions, 28 deletions
diff --git a/src/recode.c b/src/recode.c
index c44095c..7e12343 100644
--- a/src/recode.c
+++ b/src/recode.c
@@ -2,6 +2,8 @@
#include <stdlib.h>
#include <string.h>
+#include "../config.h"
+
#include "internal.h"
#include "rcciconv.h"
#include "fs.h"
@@ -10,19 +12,140 @@
#include "rccconfig.h"
#include "rccdb4.h"
#include "rcctranslate.h"
+#include "rccspell.h"
+
+#define isSpace(ch) ((ch<0x7F)&&((ch<'A')||(ch>'z')||((ch>'Z')&&(ch<'a'))))
+#define RCC_REQUIRED_PROBABILITY 0.66
+
+rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len, rcc_string *retstring) {
+ rcc_speller speller;
+ unsigned long i, nlanguages;
+ rcc_language_config config, config0 = NULL;
+ rcc_string recoded;
+ unsigned char *utf8;
+ size_t j, mode;
+ unsigned long words, english, result;
+ unsigned char english_mode, english_word = 1;
+ rcc_language_id english_lang = (rcc_language_id)-1;
+ double res, english_res = 0;
+ rcc_option_value usedb4;
+
+
+ usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE);
+
+ if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) {
+ recoded = rccDb4GetKey(ctx->db4ctx, buf, len);
+ if (recoded) {
+ if (rccStringFixID(recoded, ctx)) free(recoded);
+ else {
+ english_lang = rccStringGetLanguage(recoded);
+ if (retstring) *retstring = recoded;
+ else free(recoded);
+ return english_lang;
+ }
+ }
+ }
+
+ if (!rccGetOption(ctx, RCC_OPTION_AUTODETECT_LANGUAGE)) return (rcc_language_id)-1;
+
+ nlanguages = ctx->n_languages;
+
+ for (i=0;i<nlanguages;i++) {
+ config = rccGetUsableConfig(ctx, (rcc_language_id)i);
+ if (!config) continue;
+
+ if (i) {
+ if (config==config0) continue;
+ } else config0=config;
+
+ speller = rccConfigGetSpeller(config);
+ if (rccSpellerGetError(speller)) continue;
+
+ recoded = rccConfigSizedFrom(config, class_id, buf, len);
+ if (!recoded) continue;
+
+ if (!strcasecmp(config->language->sn, rcc_english_language_sn)) english_mode = 1;
+ else english_mode = 0;
+
+ utf8 = (char*)rccStringGetString(recoded);
+ for (result=0,english=0,words=0,mode=0,j=0;utf8[j];j++) {
+ if (isSpace(utf8[j])) {
+ if (mode) {
+ if ((!english_mode)&&(english_word)) english++;
+ result+=rccSpellerSized(speller, utf8 + mode - 1, j - mode + 1)?1:0;
+ words++;
+ mode = 0;
+ } else continue;
+ } else {
+ if (mode) {
+ if (utf8[j]>0x7F) english_word = 0;
+ } else {
+ mode = j + 1;
+ english_word = 1;
+ }
+ }
+ }
+ if (mode) {
+ result+=rccSpeller(speller, utf8 + mode - 1)?1:0;
+ words++;
+ }
+
+ if (english_mode) {
+ english_res = 1.*result/words;
+ english_lang = (rcc_language_id)i;
+ } else if (words) {
+ res = 1.*result/words;
+ if (res > RCC_REQUIRED_PROBABILITY) {
+ if (retstring) *retstring = recoded;
+ else free(recoded);
+ return (rcc_language_id)i;
+ }
+ if (words > english) {
+ res = 1.*(result - english)/(words - english);
+ if (res > RCC_REQUIRED_PROBABILITY) {
+ if (retstring) *retstring = recoded;
+ else free(recoded);
+ return (rcc_language_id)i;
+ }
+ }
+ }
+
+ free(recoded);
+ }
+
+ if (english_res > RCC_REQUIRED_PROBABILITY) {
+ if (retstring) {
+ *retstring = rccCreateString(english_lang, buf, len);
+ }
+ return english_lang;
+ }
+
+ return (rcc_language_id)-1;
+}
+rcc_language_id rccDetectLanguage(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) {
+ if (!ctx) {
+ if (rcc_default_ctx) ctx = rcc_default_ctx;
+ else return -1;
+ }
+
+ return rccDetectLanguageInternal(ctx, class_id, buf, len, NULL);
+}
-static rcc_autocharset_id rccIConvAuto(rcc_context ctx, rcc_class_id class_id, const char *buf, int len) {
+rcc_autocharset_id rccConfigDetectCharset(rcc_language_config config, rcc_class_id class_id, const char *buf, size_t len) {
+ rcc_context ctx;
rcc_class_type class_type;
rcc_engine_ptr engine;
- if (!buf) return (rcc_autocharset_id)-1;
+ if ((!buf)||(!config)) return (rcc_autocharset_id)-1;
+
+ ctx = config->ctx;
class_type = rccGetClassType(ctx, class_id);
if ((class_type != RCC_CLASS_FS)||((class_type == RCC_CLASS_FS)&&(rccGetOption(ctx, RCC_OPTION_AUTODETECT_FS_TITLES)))) {
- engine = rccGetCurrentEnginePointer(ctx);
+ engine = rccConfigGetCurrentEnginePointer(config);
if ((!engine)||(!engine->func)) return (rcc_autocharset_id)-1;
return engine->func(&ctx->engine_ctx, buf, len);
}
@@ -30,16 +153,26 @@ static rcc_autocharset_id rccIConvAuto(rcc_context ctx, rcc_class_id class_id, c
return (rcc_autocharset_id)-1;
}
+int rccDetectCharset(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) {
+ if (!ctx) {
+ if (rcc_default_ctx) ctx = rcc_default_ctx;
+ else return -1;
+ }
+
+ return rccConfigDetectCharset(ctx->current_config, class_id, buf, len);
+}
+
+
rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) {
int err;
size_t ret;
- rcc_language_id language_id;
+ rcc_language_id language_id, detected_language_id;
rcc_autocharset_id charset_id;
rcc_iconv icnv = NULL;
rcc_string result;
rcc_option_value usedb4;
const char *charset;
-
+
if (!ctx) {
if (rcc_default_ctx) ctx = rcc_default_ctx;
else return NULL;
@@ -52,10 +185,11 @@ rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf,
language_id = rccGetCurrentLanguage(ctx);
if (language_id == (rcc_language_id)-1) return NULL;
- if (!strcasecmp(ctx->languages[language_id]->sn, "off")) return NULL;
+ if (!strcasecmp(ctx->languages[language_id]->sn, rcc_disabled_language_sn)) return NULL;
- usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE);
+ usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE);
+/*
if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) {
result = rccDb4GetKey(ctx->db4ctx, buf, len);
if (result) {
@@ -63,11 +197,22 @@ rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf,
else return result;
}
}
+
+ if (rccGetOption(ctx, RCC_OPTION_AUTODETECT_LANGUAGE)) {
+ detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len);
+ if (detected_language_id != (rcc_language_id)-1)
+ language_id = detected_language_id;
+ }
+*/
+
+ detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len, &result);
+ if (detected_language_id != (rcc_language_id)-1) return result;
+
err = rccConfigure(ctx);
if (err) return NULL;
- charset_id = rccIConvAuto(ctx, class_id, buf, len);
+ charset_id = rccDetectCharset(ctx, class_id, buf, len);
if (charset_id != (rcc_autocharset_id)-1) {
icnv = ctx->iconv_auto[charset_id];
if (rccGetOption(ctx, RCC_OPTION_AUTOENGINE_SET_CURRENT)) {
@@ -105,6 +250,9 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s
rcc_language_id language_id;
rcc_language_id current_language_id;
rcc_class_type class_type;
+ rcc_option_value translate;
+ const char *langname;
+ unsigned char english_source;
rcc_iconv icnv;
if (!ctx) {
@@ -127,33 +275,60 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s
if (err) return NULL;
class_type = rccGetClassType(ctx, class_id);
- if ((class_type != RCC_CLASS_FS)&&(rccGetOption(ctx, RCC_OPTION_TRANSLATE))) {
+ translate = rccGetOption(ctx, RCC_OPTION_TRANSLATE);
+
+ langname = rccGetLanguageName(ctx, language_id);
+ if (strcasecmp(langname, rcc_english_language_sn)) english_source = 0;
+ else english_source = 1;
+
+ if ((class_type != RCC_CLASS_FS)&&((translate==RCC_OPTION_TRANSLATE_FULL)||((translate)&&(!english_source)))) {
current_language_id = rccGetCurrentLanguage(ctx);
if (current_language_id != language_id) {
if ((config->trans)&&(config->translang != current_language_id)) {
rccTranslateClose(config->trans);
config->trans = NULL;
}
- if (!config->trans) {
- config->trans = rccTranslateOpen(rccGetLanguageName(ctx, language_id), rccGetLanguageName(ctx, current_language_id));
- config->translang = current_language_id;
+
+ if (translate != RCC_OPTION_TRANSLATE_TO_ENGLISH) {
+ if (!config->trans) {
+ config->trans = rccTranslateOpen(rccGetLanguageName(ctx, language_id), rccGetLanguageName(ctx, current_language_id));
+ config->translang = current_language_id;
+ }
+
+ if (config->trans) {
+ translated = rccTranslate(config->trans, utfstring);
+ if (translated) {
+ language_id = current_language_id;
+
+ config = rccGetConfig(ctx, language_id);
+ if (!config) {
+ free(translated);
+ return NULL;
+ }
+
+ err = rccConfigConfigure(config);
+ if (err) {
+ free(translated);
+ return NULL;
+ }
+ }
+ }
}
- if (config->trans) {
- translated = rccTranslate(config->trans, utfstring);
- if (translated) {
- language_id = current_language_id;
+
+ if ((translate == RCC_OPTION_TRANSLATE_TO_ENGLISH)||((config->trans)&&(!translated))) {
+ puts("entrans");
+ if (!config->entrans) {
+ config->entrans = rccTranslateOpen(rccGetLanguageName(ctx, language_id), rcc_english_language_sn);
+ }
+ if (config->entrans) {
+ translated = rccTranslate(config->entrans, utfstring);
+
config = rccGetConfig(ctx, language_id);
- if (!config) {
- free(translated);
- return NULL;
- }
+ if (!config) return translated;
err = rccConfigConfigure(config);
- if (err) {
- free(translated);
- return NULL;
- }
+ if (err) return translated;
}
}
}
@@ -183,7 +358,7 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s
icnv = config->iconv_to[class_id];
if (icnv) {
- newlen = rccIConvInternal(ctx, icnv, translated?translated:utfstring, newlen);
+ newlen = rccIConvInternal(ctx, icnv, translated?translated:utfstring, translated?0:newlen);
if (translated) free(translated);
if (newlen == (size_t)-1) return NULL;
@@ -237,7 +412,7 @@ char *rccSizedRecode(rcc_context ctx, rcc_class_id from, rcc_class_id to, const
err = rccConfigure(ctx);
if (err) return NULL;
- from_charset_id = rccIConvAuto(ctx, from, buf, len);
+ from_charset_id = rccDetectCharset(ctx, from, buf, len);
if (from_charset_id != (rcc_charset_id)-1) {
from_charset = rccGetAutoCharsetName(ctx, from_charset_id);
to_charset = rccGetCurrentCharsetName(ctx, to);
@@ -385,13 +560,15 @@ char *rccSizedRecodeToCharset(rcc_context ctx, rcc_class_id class_id, const char
return extracted;
}
-/* Convert to class_id from Charset */
+/* Convert to class_id from Charset.
+Usage of this function assuming the knowledge about the incoming string.
+The charset as well as the language. So no detection (DB4,Aspell) of language
+will be performed. */
char *rccSizedRecodeFromCharset(rcc_context ctx, rcc_class_id class_id, const char *charset, const char *buf, size_t len, size_t *rlen) {
size_t res;
rcc_iconv icnv;
rcc_string str;
char *extracted;
-
if (!charset) return NULL;