Module: NKF
In: nkf/nkf.c
Guess Encoding By NKF2.0 Routine
/* * NKF.guess2 * * Guess Encoding By NKF2.0 Routine */ static VALUE rb_nkf_guess2(obj, src) VALUE obj, src; { int code = _BINARY; reinit(); input_ctr = 0; StringValue(src); input = RSTRING(src)->ptr; i_len = RSTRING(src)->len; if(x0201_f == WISH_TRUE) x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201); guess_f = TRUE; kanji_convert( NULL ); guess_f = FALSE; if (!is_inputcode_mixed) { if (strcmp(input_codename, "") == 0) { code = _ASCII; } else if (strcmp(input_codename, "ISO-2022-JP") == 0) { code = _JIS; } else if (strcmp(input_codename, "EUC-JP") == 0) { code = _EUC; } else if (strcmp(input_codename, "Shift_JIS") == 0) { code = _SJIS; } else if (strcmp(input_codename, "UTF-8") == 0) { code = _UTF8; } else if (strcmp(input_codename, "UTF-16") == 0) { code = _UTF16; } else if (strlen(input_codename) > 0) { code = _UNKNOWN; } } return INT2FIX( code ); }
Character code detection. The algorithm is described in: Ken Lunde, "Understanding Japanese Information Processing", Sebastopol, CA: O'Reilly & Associates.
/*
 *  NKF.guess1
 *
 *  Character code detection - Algorithm described in:
 *  Ken Lunde. `Understanding Japanese Information Processing'
 *  Sebastopol, CA: O'Reilly & Associates.
 *
 *  Scans the bytes of src from the start and returns as soon as one
 *  byte (or byte pair) is decisive. The result is INT2FIX() of one of
 *  _JIS, _SJIS, _EUC, _BINARY or _UNKNOWN. An empty string, or a string
 *  that is exhausted without a decisive byte, yields _UNKNOWN.
 */
static VALUE
rb_nkf_guess1(obj, src)
    VALUE obj, src;
{
    unsigned char *p;
    unsigned char *pend;
    int sequence_counter = 0;

    StringValue(src);
    p = RSTRING(src)->ptr;
    pend = p + RSTRING(src)->len;
    /* Nothing to inspect: empty input is reported as _UNKNOWN. */
    if (p == pend) return INT2FIX(_UNKNOWN);

/*
 * INCR advances p by one byte and may return from the enclosing function:
 *   - returns _UNKNOWN when the end of the input is reached, so *p is
 *     never read at pend after an INCR;
 *   - bumps sequence_counter, resetting it to 0 whenever the counter is
 *     odd and the current byte is not 0xa4;
 *   - returns _EUC once the counter reaches 6, i.e. 0xa4 was seen at
 *     counter values 1, 3 and 5 in a row.
 * NOTE(review): 0xa4 is presumably the EUC-JP hiragana row lead byte;
 * confirm against the cited reference.
 * Because it contains return statements and uses the locals p, pend and
 * sequence_counter by name, INCR is only meaningful inside this function.
 * It is not #undef'd within this block.
 */
#define INCR do {\
    p++;\
    if (p==pend) return INT2FIX(_UNKNOWN);\
    sequence_counter++;\
    if (sequence_counter % 2 == 1 && *p != 0xa4)\
        sequence_counter = 0;\
    if (6 <= sequence_counter) {\
        sequence_counter = 0;\
        return INT2FIX(_EUC);\
    }\
} while (0)

    /* INCR only examines bytes after advancing, so seed the counter
     * here when the very first byte is 0xa4. */
    if (*p == 0xa4)
        sequence_counter = 1;

    while (p<pend) {
        /* ESC byte: reported as _JIS. */
        if (*p == '\033') {
            return INT2FIX(_JIS);
        }
        /* Bytes below 0x06, and 0x7f / 0xff, are reported as _BINARY. */
        if (*p < '\006' || *p == 0x7f || *p == 0xff) {
            return INT2FIX(_BINARY);
        }
        /* 0x81-0x8d and 0x8f-0x9f are reported as _SJIS; 0x8e is
         * excluded here and handled separately below. */
        if (0x81 <= *p && *p <= 0x8d) {
            return INT2FIX(_SJIS);
        }
        if (0x8f <= *p && *p <= 0x9f) {
            return INT2FIX(_SJIS);
        }
        if (*p == 0x8e) {       /* SS2 */
            /* Decide on the byte that follows 0x8e. */
            INCR;
            if ((0x40 <= *p && *p <= 0x7e) ||
                (0x80 <= *p && *p <= 0xa0) ||
                (0xe0 <= *p && *p <= 0xfc))
                return INT2FIX(_SJIS);
        }
        else if (0xa1 <= *p && *p <= 0xdf) {
            /* Decide on the byte that follows one in 0xa1-0xdf. */
            INCR;
            if (0xf0 <= *p && *p <= 0xfe)
                return INT2FIX(_EUC);
            if (0xe0 <= *p && *p <= 0xef) {
                /* Keep consuming bytes >= 0x40 until one of them is
                 * decisive. INCR returns _UNKNOWN itself if the input
                 * ends first. */
                while (p < pend && *p >= 0x40) {
                    if (*p >= 0x81) {
                        if (*p <= 0x8d || (0x8f <= *p && *p <= 0x9f)) {
                            return INT2FIX(_SJIS);
                        }
                        else if (0xfd <= *p && *p <= 0xfe) {
                            return INT2FIX(_EUC);
                        }
                    }
                    INCR;
                }
            }
            else if (*p <= 0x9f) {
                return INT2FIX(_SJIS);
            }
        }
        else if (0xf0 <= *p && *p <= 0xfe) {
            return INT2FIX(_EUC);
        }
        else if (0xe0 <= *p && *p <= 0xef) {
            /* Decide on the byte that follows one in 0xe0-0xef. */
            INCR;
            if ((0x40 <= *p && *p <= 0x7e) ||
                (0x80 <= *p && *p <= 0xa0)) {
                return INT2FIX(_SJIS);
            }
            if (0xfd <= *p && *p <= 0xfe) {
                return INT2FIX(_EUC);
            }
        }
        /* No decision from this byte: move on to the next one. */
        INCR;
    }
    return INT2FIX(_UNKNOWN);
}
Guess Encoding By NKF2.0 Routine
/* * NKF.guess2 * * Guess Encoding By NKF2.0 Routine */ static VALUE rb_nkf_guess2(obj, src) VALUE obj, src; { int code = _BINARY; reinit(); input_ctr = 0; StringValue(src); input = RSTRING(src)->ptr; i_len = RSTRING(src)->len; if(x0201_f == WISH_TRUE) x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201); guess_f = TRUE; kanji_convert( NULL ); guess_f = FALSE; if (!is_inputcode_mixed) { if (strcmp(input_codename, "") == 0) { code = _ASCII; } else if (strcmp(input_codename, "ISO-2022-JP") == 0) { code = _JIS; } else if (strcmp(input_codename, "EUC-JP") == 0) { code = _EUC; } else if (strcmp(input_codename, "Shift_JIS") == 0) { code = _SJIS; } else if (strcmp(input_codename, "UTF-8") == 0) { code = _UTF8; } else if (strcmp(input_codename, "UTF-16") == 0) { code = _UTF16; } else if (strlen(input_codename) > 0) { code = _UNKNOWN; } } return INT2FIX( code ); }