UTF-8 conversion functions designed for use in PHP-GTK apps Adam Rambousek - rambousek@volny.cz version history: 1.03 --- 12/02/2002 * added Win1257 support 1.02 --- 30/11/2001 * added ISO8859-1 support 1.01 * added Win1250 support 1.00 string to_utf8(string string [, string charset]) string from_utf8(string string [, string charset]) supported charsets: name of charset you must use in script ISO8859-2: iso2 (this is the default charset, you don't have to specify it) Windows1250: win1250 ISO8859-1: iso1 Windows1257: win1257 example: $new_string=to_utf8($some_string,"win1250"); */ /* translation table - actually, it's array where key is hexadecimal number of character in ISO8859-2/Windows1250 and value is its two byte representation in UTF-8 */ $iso2_utf8 = array( 0x80=>"\xc2\x80", 0x81=>"\xc2\x81", 0x82=>"\xc2\x82", 0x83=>"\xc2\x83", 0x84=>"\xc2\x84", 0x85=>"\xc2\x85", 0x86=>"\xc2\x86", 0x87=>"\xc2\x87", 0x88=>"\xc2\x88", 0x89=>"\xc2\x89", 0x8A=>"\xc2\x8a", 0x8B=>"\xc2\x8b", 0x8C=>"\xc2\x8c", 0x8D=>"\xc2\x8d", 0x8E=>"\xc2\x8e", 0x8F=>"\xc2\x8f", 0x90=>"\xc2\x90", 0x91=>"\xc2\x91", 0x92=>"\xc2\x92", 0x93=>"\xc2\x93", 0x94=>"\xc2\x94", 0x95=>"\xc2\x95", 0x96=>"\xc2\x96", 0x97=>"\xc2\x97", 0x98=>"\xc2\x98", 0x99=>"\xc2\x99", 0x9A=>"\xc2\x9a", 0x9B=>"\xc2\x9b", 0x9C=>"\xc2\x9c", 0x9D=>"\xc2\x9d", 0x9E=>"\xc2\x9e", 0x9F=>"\xc2\x9f", 0xA0=>"\xc2\xa0", 0xA1=>"\xc4\x84", 0xA2=>"\xcb\x98", 0xA3=>"\xc5\x81", 0xA4=>"\xc2\xa4", 0xA5=>"\xc4\xbd", 0xA6=>"\xc5\x9a", 0xA7=>"\xc2\xa7", 0xA8=>"\xc2\xa8", 0xA9=>"\xc5\xa0", 0xAA=>"\xc5\x9e", 0xAB=>"\xc5\xa4", 0xAC=>"\xc5\xb9", 0xAD=>"\xc2\xad", 0xAE=>"\xc5\xbd", 0xAF=>"\xc5\xbb", 0xB0=>"\xc2\xb0", 0xB1=>"\xc4\x85", 0xB2=>"\xcb\x9b", 0xB3=>"\xc5\x82", 0xB4=>"\xc2\xb4", 0xB5=>"\xc4\xbe", 0xB6=>"\xc5\x9b", 0xB7=>"\xcb\x87", 0xB8=>"\xc2\xb8", 0xB9=>"\xc5\xa1", 0xBA=>"\xc5\x9f", 0xBB=>"\xc5\xa5", 0xBC=>"\xc5\xba", 0xBD=>"\xcb\x9d", 0xBE=>"\xc5\xbe", 0xBF=>"\xc5\xbc", 0xC0=>"\xc5\x94", 0xC1=>"\xc3\x81", 0xC2=>"\xc3\x82", 0xC3=>"\xc4\x82", 0xC4=>"\xc3\x84", 0xC5=>"\xc4\xb9", 0xC6=>"\xc4\x86", 0xC7=>"\xc3\x87", 0xC8=>"\xc4\x8c", 0xC9=>"\xc3\x89", 0xCA=>"\xc4\x98", 0xCB=>"\xc3\x8b", 0xCC=>"\xc4\x9a", 0xCD=>"\xc3\x8d", 0xCE=>"\xc3\x8e", 0xCF=>"\xc4\x8e", 0xD0=>"\xc4\x90", 0xD1=>"\xc5\x83", 0xD2=>"\xc5\x87", 0xD3=>"\xc3\x93", 0xD4=>"\xc3\x94", 0xD5=>"\xc5\x90", 0xD6=>"\xc3\x96", 0xD7=>"\xc3\x97", 0xD8=>"\xc5\x98", 0xD9=>"\xc5\xae", 0xDA=>"\xc3\x9a", 0xDB=>"\xc5\xb0", 0xDC=>"\xc3\x9c", 0xDD=>"\xc3\x9d", 0xDE=>"\xc5\xa2", 0xDF=>"\xc3\x9f", 0xE0=>"\xc5\x95", 0xE1=>"\xc3\xa1", 0xE2=>"\xc3\xa2", 0xE3=>"\xc4\x83", 0xE4=>"\xc3\xa4", 0xE5=>"\xc4\xba", 0xE6=>"\xc4\x87", 0xE7=>"\xc3\xa7", 0xE8=>"\xc4\x8d", 0xE9=>"\xc3\xa9", 0xEA=>"\xc4\x99", 0xEB=>"\xc3\xab", 0xEC=>"\xc4\x9b", 0xED=>"\xc3\xad", 0xEE=>"\xc3\xae", 0xEF=>"\xc4\x8f", 0xF0=>"\xc4\x91", 0xF1=>"\xc5\x84", 0xF2=>"\xc5\x88", 0xF3=>"\xc3\xb3", 0xF4=>"\xc3\xb4", 0xF5=>"\xc5\x91", 0xF6=>"\xc3\xb6", 0xF7=>"\xc3\xb7", 0xF8=>"\xc5\x99", 0xF9=>"\xc5\xaf", 0xFA=>"\xc3\xba", 0xFB=>"\xc5\xb1", 0xFC=>"\xc3\xbc", 0xFD=>"\xc3\xbd", 0xFE=>"\xc5\xa3", 0xFF=>"\xcb\x99" ); $win1250_utf8 = array( 0x80=>"\xc2\x80", 0x81=>"\xc2\x81", 0x82=>"\x140\x9a", 0x83=>"\xc2\x83", 0x84=>"\x140\x9e", 0x85=>"\x140\xa6", 0x86=>"\x140\xa0", 0x87=>"\x140\xa1", 0x88=>"\xc2\x88", 0x89=>"\x140\xb0", 0x8a=>"\xc5\xa0", 0x8b=>"\x140\xb9", 0x8c=>"\xc5\x9a", 0x8d=>"\xc5\xa4", 0x8e=>"\xc5\xbd", 0x8f=>"\xc5\xb9", 0x90=>"\xc2\x90", 0x91=>"\x140\x98", 0x92=>"\x140\x99", 0x93=>"\x140\x9c", 0x94=>"\x140\x9d", 0x95=>"\x140\xa2", 0x96=>"\x140\x93", 0x97=>"\x140\x94", 0x98=>"\xc2\x98", 0x99=>"\x144\xa2", 0x9a=>"\xc5\xa1", 0x9b=>"\x140\xba", 0x9c=>"\xc5\x9b", 0x9d=>"\xc5\xa5", 0x9e=>"\xc5\xbe", 0x9f=>"\xc5\xba", 0xa0=>"\xc2\xa0", 0xa1=>"\xcb\x87", 0xa2=>"\xcb\x98", 0xa3=>"\xc5\x81", 0xa4=>"\xc2\xa4", 0xa5=>"\xc4\x84", 0xa6=>"\xc2\xa6", 0xa7=>"\xc2\xa7", 0xa8=>"\xc2\xa8", 0xa9=>"\xc2\xa9", 0xaa=>"\xc5\x9e", 0xab=>"\xc2\xab", 0xac=>"\xc2\xac", 0xad=>"\xc2\xad", 0xae=>"\xc2\xae", 0xaf=>"\xc5\xbb", 0xb0=>"\xc2\xb0", 0xb1=>"\xc2\xb1", 0xb2=>"\xcb\x9b", 0xb3=>"\xc5\x82", 0xb4=>"\xc2\xb4", 0xb5=>"\xc2\xb5", 0xb6=>"\xc2\xb6", 0xb7=>"\xc2\xb7", 0xb8=>"\xc2\xb8", 0xb9=>"\xc4\x85", 0xba=>"\xc5\x9f", 0xbb=>"\xc2\xbb", 0xbc=>"\xc4\xbd", 0xbd=>"\xcb\x9d", 0xbe=>"\xc4\xbe", 0xbf=>"\xc5\xbc", 0xc0=>"\xc5\x94", 0xc1=>"\xc3\x81", 0xc2=>"\xc3\x82", 0xc3=>"\xc4\x82", 0xc4=>"\xc3\x84", 0xc5=>"\xc4\xb9", 0xc6=>"\xc4\x86", 0xc7=>"\xc3\x87", 0xc8=>"\xc4\x8c", 0xc9=>"\xc3\x89", 0xca=>"\xc4\x98", 0xcb=>"\xc3\x8b", 0xcc=>"\xc4\x9a", 0xcd=>"\xc3\x8d", 0xce=>"\xc3\x8e", 0xcf=>"\xc4\x8e", 0xd0=>"\xc4\x90", 0xd1=>"\xc5\x83", 0xd2=>"\xc5\x87", 0xd3=>"\xc3\x93", 0xd4=>"\xc3\x94", 0xd5=>"\xc5\x90", 0xd6=>"\xc3\x96", 0xd7=>"\xc3\x97", 0xd8=>"\xc5\x98", 0xd9=>"\xc5\xae", 0xda=>"\xc3\x9a", 0xdb=>"\xc5\xb0", 0xdc=>"\xc3\x9c", 0xdd=>"\xc3\x9d", 0xde=>"\xc5\xa2", 0xdf=>"\xc3\x9f", 0xe0=>"\xc5\x95", 0xe1=>"\xc3\xa1", 0xe2=>"\xc3\xa2", 0xe3=>"\xc4\x83", 0xe4=>"\xc3\xa4", 0xe5=>"\xc4\xba", 0xe6=>"\xc4\x87", 0xe7=>"\xc3\xa7", 0xe8=>"\xc4\x8d", 0xe9=>"\xc3\xa9", 0xea=>"\xc4\x99", 0xeb=>"\xc3\xab", 0xec=>"\xc4\x9b", 0xed=>"\xc3\xad", 0xee=>"\xc3\xae", 0xef=>"\xc4\x8f", 0xf0=>"\xc4\x91", 0xf1=>"\xc5\x84", 0xf2=>"\xc5\x88", 0xf3=>"\xc3\xb3", 0xf4=>"\xc3\xb4", 0xf5=>"\xc5\x91", 0xf6=>"\xc3\xb6", 0xf7=>"\xc3\xb7", 0xf8=>"\xc5\x99", 0xf9=>"\xc5\xaf", 0xfa=>"\xc3\xba", 0xfb=>"\xc5\xb1", 0xfc=>"\xc3\xbc", 0xfd=>"\xc3\xbd", 0xfe=>"\xc5\xa3", 0xff=>"\xcb\x99" ); $iso1_utf8 = array( 0xA0=>"\xc2\xa0", 0xA1=>"\xc2\xa1", 0xA2=>"\xc2\xa2", 0xA3=>"\xc2\xa3", 0xA4=>"\xc2\xa4", 0xA5=>"\xc2\xa5", 0xA6=>"\xc2\xa6", 0xA7=>"\xc2\xa7", 0xA8=>"\xc2\xa8", 0xA9=>"\xc2\xa9", 0xAA=>"\xc2\xaa", 0xAB=>"\xc2\xab", 0xAC=>"\xc2\xac", 0xAD=>"\xc2\xad", 0xAE=>"\xc2\xae", 0xAF=>"\xc2\xaf", 0xB0=>"\xc2\xb0", 0xB1=>"\xc2\xb1", 0xB2=>"\xc2\xb2", 0xB3=>"\xc2\xb3", 0xB4=>"\xc2\xb4", 0xB5=>"\xc2\xb5", 0xB6=>"\xc2\xb6", 0xB7=>"\xc2\xb7", 0xB8=>"\xc2\xb8", 0xB9=>"\xc2\xb9", 0xBA=>"\xc2\xba", 0xBB=>"\xc2\xbb", 0xBC=>"\xc2\xbc", 0xBD=>"\xc2\xbd", 0xBE=>"\xc2\xbe", 0xBF=>"\xc2\xbf", 0xC0=>"\xc3\x80", 0xC1=>"\xc3\x81", 0xC2=>"\xc3\x82", 0xC3=>"\xc3\x83", 0xC4=>"\xc3\x84", 0xC5=>"\xc3\x85", 0xC6=>"\xc3\x86", 0xC7=>"\xc3\x87", 0xC8=>"\xc3\x88", 0xC9=>"\xc3\x89", 0xCA=>"\xc3\x8a", 0xCB=>"\xc3\x8b", 0xCC=>"\xc3\x8c", 0xCD=>"\xc3\x8d", 0xCE=>"\xc3\x8e", 0xCF=>"\xc3\x8f", 0xD0=>"\xc3\x90", 0xD1=>"\xc3\x91", 0xD2=>"\xc3\x92", 0xD3=>"\xc3\x93", 0xD4=>"\xc3\x94", 0xD5=>"\xc3\x95", 0xD6=>"\xc3\x96", 0xD7=>"\xc3\x97", 0xD8=>"\xc3\x98", 0xD9=>"\xc3\x99", 0xDA=>"\xc3\x9a", 0xDB=>"\xc3\x9b", 0xDC=>"\xc3\x9c", 0xDD=>"\xc3\x9d", 0xDE=>"\xc3\x9e", 0xDF=>"\xc3\x9f", 0xE0=>"\xc3\xa0", 0xE1=>"\xc3\xa1", 0xE2=>"\xc3\xa2", 0xE3=>"\xc3\xa3", 0xE4=>"\xc3\xa4", 0xE5=>"\xc3\xa5", 0xE6=>"\xc3\xa6", 0xE7=>"\xc3\xa7", 0xE8=>"\xc3\xa8", 0xE9=>"\xc3\xa9", 0xEA=>"\xc3\xaa", 0xEB=>"\xc3\xab", 0xEC=>"\xc3\xac", 0xED=>"\xc3\xad", 0xEE=>"\xc3\xae", 0xEF=>"\xc3\xaf", 0xF0=>"\xc3\xb0", 0xF1=>"\xc3\xb1", 0xF2=>"\xc3\xb2", 0xF3=>"\xc3\xb3", 0xF4=>"\xc3\xb4", 0xF5=>"\xc3\xb5", 0xF6=>"\xc3\xb6", 0xF7=>"\xc3\xb7", 0xF8=>"\xc3\xb8", 0xF9=>"\xc3\xb9", 0xFA=>"\xc3\xba", 0xFB=>"\xc3\xbb", 0xFC=>"\xc3\xbc", 0xFD=>"\xc3\xbd", 0xFE=>"\xc3\xbe" ); $win1257_utf8 = array( 0x80=>"\x142\xac", 0x81=>"\xc0\x4", 0x82=>"\x140\x9a", 0x83=>"\xc0\x4", 0x84=>"\x140\x9e", 0x85=>"\x140\xa6", 0x86=>"\x140\xa0", 0x87=>"\x140\xa1", 0x88=>"\xc0\x4", 0x89=>"\x140\xb0", 0x8A=>"\xc0\x4", 0x8B=>"\x140\xb9", 0x8C=>"\xc0\x4", 0x8D=>"\xc2\xa8", 0x8E=>"\xcb\x87", 0x8F=>"\xc2\xb8", 0x90=>"\xc0\x4", 0x91=>"\x140\x98", 0x92=>"\x140\x99", 0x93=>"\x140\x9c", 0x94=>"\x140\x9d", 0x95=>"\x140\xa2", 0x96=>"\x140\x93", 0x97=>"\x140\x94", 0x98=>"\xc0\x4", 0x99=>"\x144\xa2", 0x9A=>"\xc0\x4", 0x9B=>"\x140\xba", 0x9C=>"\xc0\x4", 0x9D=>"\xc2\xaf", 0x9E=>"\xcb\x9b", 0x9F=>"\xc0\x4", 0xA0=>"\xc2\xa0", 0xA1=>"\xc0\x4", 0xA2=>"\xc2\xa2", 0xA3=>"\xc2\xa3", 0xA4=>"\xc2\xa4", 0xA5=>"\xc0\x4", 0xA6=>"\xc2\xa6", 0xA7=>"\xc2\xa7", 0xA8=>"\xc3\x98", 0xA9=>"\xc2\xa9", 0xAA=>"\xc5\x96", 0xAB=>"\xc2\xab", 0xAC=>"\xc2\xac", 0xAD=>"\xc2\xad", 0xAE=>"\xc2\xae", 0xAF=>"\xc3\x86", 0xB0=>"\xc2\xb0", 0xB1=>"\xc2\xb1", 0xB2=>"\xc2\xb2", 0xB3=>"\xc2\xb3", 0xB4=>"\xc2\xb4", 0xB5=>"\xc2\xb5", 0xB6=>"\xc2\xb6", 0xB7=>"\xc2\xb7", 0xB8=>"\xc3\xb8", 0xB9=>"\xc2\xb9", 0xBA=>"\xc5\x97", 0xBB=>"\xc2\xbb", 0xBC=>"\xc2\xbc", 0xBD=>"\xc2\xbd", 0xBE=>"\xc2\xbe", 0xBF=>"\xc3\xa6", 0xC0=>"\xc4\x84", 0xC1=>"\xc4\xae", 0xC2=>"\xc4\x80", 0xC3=>"\xc4\x86", 0xC4=>"\xc3\x84", 0xC5=>"\xc3\x85", 0xC6=>"\xc4\x98", 0xC7=>"\xc4\x92", 0xC8=>"\xc4\x8c", 0xC9=>"\xc3\x89", 0xCA=>"\xc5\xb9", 0xCB=>"\xc4\x96", 0xCC=>"\xc4\xa2", 0xCD=>"\xc4\xb6", 0xCE=>"\xc4\xaa", 0xCF=>"\xc4\xbb", 0xD0=>"\xc5\xa0", 0xD1=>"\xc5\x83", 0xD2=>"\xc5\x85", 0xD3=>"\xc3\x93", 0xD4=>"\xc5\x8c", 0xD5=>"\xc3\x95", 0xD6=>"\xc3\x96", 0xD7=>"\xc3\x97", 0xD8=>"\xc5\xb2", 0xD9=>"\xc5\x81", 0xDA=>"\xc5\x9a", 0xDB=>"\xc5\xaa", 0xDC=>"\xc3\x9c", 0xDD=>"\xc5\xbb", 0xDE=>"\xc5\xbd", 0xDF=>"\xc3\x9f", 0xE0=>"\xc4\x85", 0xE1=>"\xc4\xaf", 0xE2=>"\xc4\x81", 0xE3=>"\xc4\x87", 0xE4=>"\xc3\xa4", 0xE5=>"\xc3\xa5", 0xE6=>"\xc4\x99", 0xE7=>"\xc4\x93", 0xE8=>"\xc4\x8d", 0xE9=>"\xc3\xa9", 0xEA=>"\xc5\xba", 0xEB=>"\xc4\x97", 0xEC=>"\xc4\xa3", 0xED=>"\xc4\xb7", 0xEE=>"\xc4\xab", 0xEF=>"\xc4\xbc", 0xF0=>"\xc5\xa1", 0xF1=>"\xc5\x84", 0xF2=>"\xc5\x86", 0xF3=>"\xc3\xb3", 0xF4=>"\xc5\x8d", 0xF5=>"\xc3\xb5", 0xF6=>"\xc3\xb6", 0xF7=>"\xc3\xb7", 0xF8=>"\xc5\xb3", 0xF9=>"\xc5\x82", 0xFA=>"\xc5\x9b", 0xFB=>"\xc5\xab", 0xFC=>"\xc3\xbc", 0xFD=>"\xc5\xbc", 0xFE=>"\xc5\xbe", 0xFF=>"\xcb\x99" ); /* function to convert to UTF-8 because characters numbered 0-127 are standard ASCII characters and are same in Unicode, we have to recode only higher characters function pass through the string and when it finds such character, it is replaced with UTF-8 two byte representation */ function to_utf8($string,$charset="iso2") { eval("global \$".$charset."_utf8;"); eval("\$coding=\$".$charset."_utf8;"); for ($i=0;$i127) { $string=substr($string,0,$i).$coding[ord($string[$i])].substr($string,++$i); } } return $string; } /* reverse function to convert from UTF-8 and again it pass through the string and when the two following bytes correspond to two byte combination given in translation array, these two characters are replaced with one character from given coding it takes the key returned by array_search() and since the key is the number of specific character, we can use chr() */ function from_utf8($string,$charset="iso2") { eval("global \$".$charset."_utf8;"); eval("\$coding=\$".$charset."_utf8;"); for ($i=0;$i