[groonga-commit:1231] ranguba/chupatext [master] * module/chupa_msword.c (chupa_msword_decomposer_feed): implemented.
null+ranguba at clear-code.com
null+ranguba at clear-code.com
Thu Sep 9 04:06:34 EDT 2010
Nobuyoshi Nakada 2010-09-09 08:06:34 +0000 (Thu, 09 Sep 2010)
New Revision: df975273292752c5dc885093d4160c823cc8fa1d
Log:
* module/chupa_msword.c (chupa_msword_decomposer_feed): implemented.
Modified files:
module/chupa_msword.c
Modified: module/chupa_msword.c (+122 -1)
===================================================================
--- module/chupa_msword.c 2010-09-09 08:03:56 +0000 (d03e48b)
+++ module/chupa_msword.c 2010-09-09 08:06:34 +0000 (8e43a81)
@@ -36,10 +36,131 @@ struct _ChupaMSWORDDecomposerClass
static GType chupa_type_msword_decomposer = 0;
+struct char_proc_arg { /* Per-parse state; char_proc() reads it back through ps->userData. */
+ GString *buffer; /* text accumulated since the last flush; NULL when nothing is pending */
+ GMemoryInputStream *dest; /* memory stream that receives each flushed chunk of text */
+ ChupaText *chupar; /* handed to chupa_text_decomposed() together with input */
+ ChupaTextInput *input; /* input built on top of dest; its metadata receives the "charset" value */
+ const char *encoding; /* charset name picked by char_proc(); NULL until it has been chosen */
+ wvParseStruct ps; /* wv parser state; ps.userData is pointed back at this struct */
+};
+
+static int /* Character handler installed with wvSetCharHandler(); every path returns 0. */
+char_proc(wvParseStruct *ps, U16 eachchar, U8 chartype, U16 lid) /* appends one character to the pending buffer and flushes on form feed */
+{
+ struct char_proc_arg *arg = ps->userData;
+ GString *s = arg->buffer;
+ gboolean first = FALSE; /* becomes TRUE only on a call that finds arg->encoding still NULL */
+
+ if (!arg->encoding) { /* choose the charset once and record it in the input's metadata */
+ ChupaMetadata *meta = chupa_text_input_get_metadata(arg->input);
+ if (chartype) {
+ arg->encoding = wvLIDToCodePageConverter(lid); /* presumably maps the language id to a code page name -- confirm against wv */
+ }
+ else {
+ arg->encoding = "UTF-8"; /* chartype 0 characters are appended as UTF-8 further down */
+ }
+ chupa_metadata_add_value(meta, "charset", arg->encoding);
+ first = TRUE;
+ }
+
+ /* take care of any oddities in Microsoft's character "encoding" */
+ /* TODO: does the above code page handler take care of these? */
+ if (chartype == 1 && eachchar == 146)
+ eachchar = 39; /* apostrophe */
+
+ switch (eachchar) { /* map Word control characters to plain-text equivalents; field marks only update parser state */
+ case 13: /* paragraph end */
+ case 11: /* hard line break */
+ eachchar = '\n';
+ break;
+
+ case 12: /* page breaks, section marks */
+ eachchar = '\f';
+ break;
+
+ case 14: /* column break */
+ eachchar = '\t';
+ break;
+
+ case 19: /* field begin */
+ /* flush current text buffer -- NOTE(review): no flush is actually performed in this branch */
+ ps->fieldstate++;
+ ps->fieldmiddle = 0;
+ return 0; /* the field mark itself is not emitted */
+ case 20: /* field separator */
+ ps->fieldmiddle = 1;
+ return 0;
+ case 21: /* field end */
+ ps->fieldstate--;
+ ps->fieldmiddle = 0;
+ return 0;
+
+ default:
+ break;
+ }
+
+ /* todo: properly handle fields */
+ if (eachchar == 0x13 || eachchar == 0x14) /* NOTE(review): 0x13/0x14 are 19/20, which already returned in the switch above, so this looks unreachable */
+ return 0;
+
+ if (!s) { /* start a new buffer lazily, on the first character after a flush */
+ s = g_string_new(0);
+ arg->buffer = s;
+ }
+
+ /* convert incoming character to unicode */
+ if (chartype) {
+ g_string_append_c(s, eachchar); /* appended as a single byte; NOTE(review): the U16 value is narrowed here -- confirm it always fits */
+ }
+ else {
+ g_string_append_unichar(s, eachchar); /* appended as the UTF-8 encoding of the code point */
+ }
+
+ if (eachchar == '\f') { /* a form feed ends the current chunk: hand the buffered text to the stream */
+ GMemoryInputStream *stream;
+ stream = G_MEMORY_INPUT_STREAM(arg->dest);
+ g_memory_input_stream_add_data(stream, s->str, s->len, g_free); /* the stream will release the character data with g_free */
+ g_string_free(s, FALSE); /* frees only the GString wrapper; the character data stays alive for the stream */
+ arg->buffer = NULL;
+ if (first) { /* NOTE(review): first is per-call, so this runs only when the call that chose the encoding also carried a form feed -- confirm intent */
+ chupa_text_decomposed(arg->chupar, arg->input);
+ }
+ }
+ return 0;
+}
+
static void /* Feeds the base input through the wv parser, with char_proc() collecting the text into a memory stream. */
chupa_msword_decomposer_feed(ChupaDecomposer *dec, ChupaText *chupar, ChupaTextInput *input)
{
+ struct char_proc_arg arg; /* state shared with char_proc() via arg.ps.userData */
 GsfInput *gi = chupa_text_input_get_base_input(input);
+
+ arg.buffer = NULL; /* no pending text yet; char_proc() allocates the buffer on demand */
+ arg.dest = G_MEMORY_INPUT_STREAM(g_memory_input_stream_new());
+ arg.chupar = chupar;
+ arg.input = chupa_text_input_new_from_stream(NULL, G_INPUT_STREAM(arg.dest), /* new input wrapping dest, named after the base input */
+ gsf_input_name(gi));
+ arg.encoding = NULL; /* charset is decided by char_proc() */
+
+ wvInitParser_gsf(&arg.ps, gi); /* NOTE(review): the result is not checked -- confirm whether an init failure must be handled */
+ arg.ps.userData = &arg;
+ wvSetCharHandler(&arg.ps, char_proc);
+ wvText(&arg.ps); /* presumably runs the parse and invokes char_proc() per character -- confirm against wv */
+ wvOLEFree(&arg.ps);
+ if (arg.buffer) { /* flush text left over after the last form feed */
+ GString *s = arg.buffer;
+ g_memory_input_stream_add_data(arg.dest, s->str, s->len, g_free); /* the stream will release the character data with g_free */
+ g_string_free(s, FALSE); /* frees only the GString wrapper, not the character data */
+ arg.buffer = NULL;
+ if (!arg.encoding) { /* NOTE(review): a non-NULL buffer means char_proc() ran, so this looks unreachable unless wvLIDToCodePageConverter() returned NULL */
+ ChupaMetadata *meta = chupa_text_input_get_metadata(arg.input);
+ chupa_metadata_add_value(meta, "charset", "US-ASCII");
+ chupa_text_decomposed(chupar, arg.input);
+ }
+ }
+ g_object_unref(arg.dest); /* release the references created in this function */
+ g_object_unref(arg.input);
}
static void
@@ -71,8 +192,8 @@ register_type(GTypeModule *type_module)
"ChupaMSWORDDecomposer",
&info, 0);
chupa_type_msword_decomposer = type;
+ chupa_decomposer_register("application/msword", type);
chupa_decomposer_register("application/x-msword", type);
- chupa_decomposer_register("application/x-ole-storage", type);
}
return type;
}
More information about the groonga-commit
mailing list