Monday, January 06, 2014

C/C++ Function to convert Unicode Japanese Katakana to Hiragana

I wrote a little tool that lets you add new words to the dictionary of the Chrome extension Rikaikun (inspired by Rikaichan for Firefox) and then recompile the index file.
The index file stores all spellings in hiragana, so the tool needed a way to convert katakana spellings to hiragana.
Fortunately, the Rikaikun source code already contained this conversion, albeit in JavaScript. The C version below writes the result to r and returns a pointer to the end of the converted string (its terminating null character).


// Modified from the Chrome extension rikaikun, file data.js.
//
// Converts the full-width and half-width katakana in the null-terminated wide
// string `word` to hiragana and writes the null-terminated result to `r`.
//
// Parameters:
//   word  Input string; it is only read. Must not be NULL.
//   r     Output buffer. Must not be NULL. The output never has more
//         characters than the input, so room for wcslen(word) + 1 elements is
//         always enough. `r` may point at the start of `word` itself
//         (in-place conversion): the write position never passes the read
//         position.
//
// Returns a pointer to the terminating null character written to `r`, i.e.
// the end of the converted string.
//
// Conversion rules:
//   - Full-width katakana U+30A1..U+30F3 map to the hiragana 0x60 below them.
//     Katakana vu (U+30F4) is never converted to hiragana.
//   - Half-width katakana U+FF66..U+FF9D map through the `ch` table.
//   - A half-width voiced (U+FF9E) or semi-voiced (U+FF9F) mark that follows
//     a suitable half-width katakana is merged with it into one character.
//     Where the `cv` table has no voiced hiragana for the preceding kana
//     (e.g. U+FF74), the mark is dropped and the kana is emitted as its
//     original half-width code; half-width u + voiced mark becomes U+30F4.
//   - The full-width tilde U+FF5E is dropped from the output.
//   - Every other character is copied unchanged.
wchar_t* katakana_to_hiragana(const wchar_t* word, wchar_t* r) {
    enum {
        FULL_KATA_FIRST    = 0x30A1, // first full-width katakana converted
        FULL_KATA_LAST     = 0x30F3, // last full-width katakana converted
        KATA_TO_HIRA_SHIFT = 0x60,   // distance from a katakana to its hiragana
        HALF_KATA_FIRST    = 0xFF66, // first index of ch[]
        HALF_KATA_LAST     = 0xFF9D, // last index of ch[]
        VOICED_MARK        = 0xFF9E, // half-width voiced sound mark
        VOICED_BASE_FIRST  = 0xFF73, // first index of cv[]
        SEMI_VOICED_MARK   = 0xFF9F, // half-width semi-voiced sound mark
        SEMI_BASE_FIRST    = 0xFF8A, // first index of cs[]
        MARK_BASE_LAST     = 0xFF8E, // last index of both cv[] and cs[]
        FULL_TILDE         = 0xFF5E  // full-width tilde, skipped
    };

    // Conversion tables. They never change, so they are built once rather
    // than copied onto the stack on every call.

    // Half-width katakana -> hiragana, indexed by (code - HALF_KATA_FIRST).
    static const wchar_t ch[] = {
        0x3092, 0x3041, 0x3043, 0x3045, 0x3047, 0x3049, 0x3083, 0x3085, 0x3087, 0x3063, 0x30FC, 0x3042, 0x3044, 0x3046,
        0x3048, 0x304A, 0x304B, 0x304D, 0x304F, 0x3051, 0x3053, 0x3055, 0x3057, 0x3059, 0x305B, 0x305D, 0x305F, 0x3061,
        0x3064, 0x3066, 0x3068, 0x306A, 0x306B, 0x306C, 0x306D, 0x306E, 0x306F, 0x3072, 0x3075, 0x3078, 0x307B, 0x307E,
        0x307F, 0x3080, 0x3081, 0x3082, 0x3084, 0x3086, 0x3088, 0x3089, 0x308A, 0x308B, 0x308C, 0x308D, 0x308F, 0x3093
    };
    // Half-width katakana followed by the voiced mark,
    // indexed by (previous code - VOICED_BASE_FIRST).
    static const wchar_t cv[] = {
        0x30F4, 0xFF74, 0xFF75, 0x304C, 0x304E, 0x3050, 0x3052, 0x3054, 0x3056, 0x3058, 0x305A, 0x305C, 0x305E, 0x3060,
        0x3062, 0x3065, 0x3067, 0x3069, 0xFF85, 0xFF86, 0xFF87, 0xFF88, 0xFF89, 0x3070, 0x3073, 0x3076, 0x3079, 0x307C
    };
    // Half-width katakana followed by the semi-voiced mark,
    // indexed by (previous code - SEMI_BASE_FIRST).
    static const wchar_t cs[] = { 0x3071, 0x3074, 0x3077, 0x307A, 0x307D };

    // Original (unconverted) code of the last character written to r, or 0
    // when there is none or a tilde was skipped in between.
    unsigned prev = 0;

    for (; *word; ++word) {
        const unsigned src = (unsigned)*word;
        unsigned out = src;

        if (src >= FULL_KATA_FIRST && src <= FULL_KATA_LAST) {
            // full-width katakana to hiragana
            out = src - KATA_TO_HIRA_SHIFT;
        }
        else if (src >= HALF_KATA_FIRST && src <= HALF_KATA_LAST) {
            // half-width katakana to hiragana
            out = ch[src - HALF_KATA_FIRST];
        }
        else if (src == VOICED_MARK) {
            // voiced mark: replace the character just written with the
            // combined form; otherwise the mark is copied through as is
            if (prev >= VOICED_BASE_FIRST && prev <= MARK_BASE_LAST) {
                --r;
                out = cv[prev - VOICED_BASE_FIRST];
            }
        }
        else if (src == SEMI_VOICED_MARK) {
            // semi-voiced mark: same idea, using the cs table
            if (prev >= SEMI_BASE_FIRST && prev <= MARK_BASE_LAST) {
                --r;
                out = cs[prev - SEMI_BASE_FIRST];
            }
        }
        else if (src == FULL_TILDE) {
            // ignore the full-width tilde; a mark right after it must not
            // combine with the character written before it
            prev = 0;
            continue;
        }

        *r++ = (wchar_t)out;
        prev = src;
    }

    *r = 0;
    return r;
}