Blame view

libraries/idna_convert/uctc.php 11 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301
<?php
/**
 * UCTC - The Unicode Transcoder
 *
 * Converts between various flavours of Unicode representations like UCS-4 or UTF-8
 * Supported schemes:
 * - UCS-4 Little Endian / Big Endian / Array (partially)
 * - UTF-16 Little Endian / Big Endian (not yet)
 * - UTF-8
 * - UTF-7
 * - UTF-7 IMAP (modified UTF-7)
 *
 * @package phlyMail Nahariya 4.0+ Default branch
 * @author Matthias Sommerfeld  <mso@phlyLabs.de>
 * @copyright 2003-2009 phlyLabs Berlin, http://phlylabs.de
 * @version 0.0.6 2009-05-10
 * @note This file has been modified by the Joomla! Project and no longer reflects the original work of its author.
 */
class uctc {
    /** Encodings accepted by convert() as $from / $to. */
    private static $mechs = array('ucs4', /*'ucs4le', 'ucs4be', */'ucs4array', /*'utf16', 'utf16le', 'utf16be', */'utf8', 'utf7', 'utf7imap');
    /** When true, the UTF-8 decoder skips its out-of-range check on the second byte. */
    private static $allow_overlong = false;
    /** Whether the current conversion replaces bad input instead of throwing. */
    private static $safe_mode;
    /** Replacement code point (an integer) used in safe mode. */
    private static $safe_char;

    /**
     * The actual conversion routine
     *
     * The data is first decoded into an array of code points ("ucs4array") and
     * then encoded into the target format.
     *
     * @param mixed $data  The data to convert, usually a string, array when converting from UCS-4 array
     * @param string $from  Original encoding of the data (one of: ucs4, ucs4array, utf8, utf7, utf7imap)
     * @param string $to  Target encoding of the data (same set as $from)
     * @param bool $safe_mode  SafeMode tries to correct invalid codepoints
     * @param int $safe_char  Code point substituted for invalid input in safe mode; a falsy value selects U+FFFC
     * @return mixed  String or array, depending on target encoding
     * @throws Exception  On an unknown format or (outside safe mode) malformed input
     * @access public
     * @since 0.0.1
     */
    public static function convert($data, $from, $to, $safe_mode = false, $safe_char = 0xFFFC)
    {
        self::$safe_mode = ($safe_mode) ? true : false;
        self::$safe_char = ($safe_char) ? $safe_char : 0xFFFC;
        // Static state: recompute on every call so an earlier safe-mode call
        // cannot leave strict-mode calls permanently lenient.
        self::$allow_overlong = self::$safe_mode;

        // Strict comparison: a loose in_array() lets e.g. int 0 match any
        // non-numeric string on PHP < 8.
        if (!in_array($from, self::$mechs, true)) throw new Exception('Invalid input format specified');
        if (!in_array($to, self::$mechs, true)) throw new Exception('Invalid output format specified');

        // Explicit dispatch instead of eval()'ing generated code.
        switch ($from) {
            case 'ucs4':
                $data = self::ucs4_ucs4array($data);
                break;
            case 'utf8':
                $data = self::utf8_ucs4array($data);
                break;
            case 'utf7':
                $data = self::utf7_ucs4array($data);
                break;
            case 'utf7imap':
                $data = self::utf7imap_ucs4array($data);
                break;
        }
        switch ($to) {
            case 'ucs4':
                $data = self::ucs4array_ucs4($data);
                break;
            case 'utf8':
                $data = self::ucs4array_utf8($data);
                break;
            case 'utf7':
                $data = self::ucs4array_utf7($data);
                break;
            case 'utf7imap':
                $data = self::ucs4array_utf7imap($data);
                break;
        }
        return $data;
    }

    /**
     * This converts an UTF-8 encoded string to its UCS-4 representation
     *
     * In safe mode malformed sequences are replaced by the safe char; otherwise
     * an Exception is thrown.
     *
     * @param string $input  The UTF-8 string to convert
     * @return array  Array of 32bit values representing each codepoint
     * @throws Exception  On malformed input when safe mode is off
     * @access private
     */
    private static function utf8_ucs4array($input)
    {
        $output = array();
        $out_len = 0;
        $inp_len = strlen($input);
        $mode = 'next';   // 'next': expect a start byte, 'add': expect continuation bytes, 'no': input was truncated
        $test = 'none';   // 'range': the next continuation byte still needs the range check
        $start_byte = 0;
        $next_byte = 0;
        for ($k = 0; $k < $inp_len; ++$k) {
            $v = ord($input[$k]); // Extract byte from input string

            if ($v < 128) { // We found an ASCII char - put into string as is
                $output[$out_len] = $v;
                ++$out_len;
                if ('add' == $mode) {
                    // ASCII in the middle of a multi byte sequence: the half-built
                    // code point just before this one is bogus.
                    if (self::$safe_mode) {
                        $output[$out_len-2] = self::$safe_char;
                        $mode = 'next';
                    } else {
                        throw new Exception('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                    }
                }
                continue;
            }
            if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
                $start_byte = $v;
                $mode = 'add';
                $test = 'range';
                if ($v >> 5 == 6) { // &110xxxxx 10xxxxx
                    $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
                    $v = ($v - 192) << 6;
                } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
                    $next_byte = 1;
                    $v = ($v - 224) << 12;
                } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 2;
                    $v = ($v - 240) << 18;
                } elseif (self::$safe_mode) {
                    $mode = 'next';
                    $output[$out_len] = self::$safe_char;
                    ++$out_len;
                    continue;
                } else {
                    throw new Exception('This might be UTF-8, but I don\'t understand it at byte '.$k);
                }
                if ($inp_len-$k-$next_byte < 2) {
                    // Fewer bytes left than this sequence needs.
                    if (!self::$safe_mode) {
                        throw new Exception('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                    }
                    $output[$out_len] = self::$safe_char;
                    ++$out_len; // keep the replacement from being overwritten by a following ASCII byte
                    $mode = 'no';
                    continue;
                }

                // Store the bits of the start byte; continuation bytes are added below
                $output[$out_len] = (int) $v;
                ++$out_len;
                continue;
            }
            if ('add' == $mode) {
                if (!self::$allow_overlong && $test == 'range') {
                    $test = 'none';
                    if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
                        throw new Exception('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
                    }
                }
                if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
                    $v = ($v-128) << ($next_byte*6);
                    $output[($out_len-1)] += $v;
                    --$next_byte;
                } else {
                    if (self::$safe_mode) {
                        // The safe char already is a code point - store it as is
                        $output[$out_len-1] = self::$safe_char;
                        $k--; // Re-read this byte as a potential start byte
                        $mode = 'next';
                        continue;
                    } else {
                        throw new Exception('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                    }
                }
                if ($next_byte < 0) {
                    $mode = 'next';
                }
            }
        } // for
        return $output;
    }

    /**
     * Encode a single code point as UTF-8
     *
     * @param int $v  The code point
     * @return mixed  The UTF-8 byte sequence, false if $v needs more than 21 bits
     * @access private
     */
    private static function codepoint_to_utf8($v)
    {
        if ($v < 128) { // 7bit are transferred literally
            return chr($v);
        }
        if ($v < (1 << 11)) { // 2 bytes
            return chr(192+($v >> 6)).chr(128+($v & 63));
        }
        if ($v < (1 << 16)) { // 3 bytes
            return chr(224+($v >> 12)).chr(128+(($v >> 6) & 63)).chr(128+($v & 63));
        }
        if ($v < (1 << 21)) { // 4 bytes
            return chr(240+($v >> 18)).chr(128+(($v >> 12) & 63)).chr(128+(($v >> 6) & 63)).chr(128+($v & 63));
        }
        return false;
    }

    /**
     * Convert UCS-4 array into UTF-8 string
     *
     * Values needing more than 21 bits are replaced by the UTF-8 encoded safe
     * char in safe mode and raise an Exception otherwise.
     *
     * @param array $input  Array of code points
     * @return string  The UTF-8 encoded string
     * @throws Exception  On an unencodable value when safe mode is off
     * @access   private
     */
    private static function ucs4array_utf8($input)
    {
        $output = '';
        foreach ($input as $k => $v) {
            $bytes = self::codepoint_to_utf8($v);
            if (false === $bytes) {
                if (!self::$safe_mode) {
                    throw new Exception('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$k);
                }
                // Emit the safe char as UTF-8, not as its decimal digits
                $bytes = self::codepoint_to_utf8(self::$safe_char);
                if (false === $bytes) {
                    $bytes = self::codepoint_to_utf8(0xFFFC);
                }
            }
            $output .= $bytes;
        }
        return $output;
    }

    /**
     * Convert a modified UTF-7 (IMAP mailbox name) string into an UCS-4 array
     *
     * @param string $input  The modified UTF-7 string
     * @return array  Array of code points
     * @access private
     */
    private static function utf7imap_ucs4array($input)
    {
        return self::utf7_ucs4array($input, '&', true);
    }

    /**
     * Decode one base64 run of a UTF-7 string into code points
     *
     * The decoded bytes are read as big endian 16bit units; a dangling odd byte
     * is dropped and surrogate pairs are combined into one code point.
     *
     * @param string $b64  The base64 characters between shift char and terminator
     * @param bool $imap  True if "," is used in place of "/" (modified UTF-7)
     * @return array  Array of code points
     * @access private
     */
    private static function utf7_decode_run($b64, $imap)
    {
        if ($imap) {
            $b64 = strtr($b64, ',', '/');
        }
        $bytes = (string) base64_decode($b64);
        $len = strlen($bytes);
        $len -= $len % 2; // Only complete 16bit units
        $out = array();
        for ($i = 0; $i < $len; $i += 2) {
            $unit = (ord($bytes[$i]) << 8) | ord($bytes[$i + 1]);
            if ($unit >= 0xD800 && $unit <= 0xDBFF && $i + 3 < $len) {
                $low = (ord($bytes[$i + 2]) << 8) | ord($bytes[$i + 3]);
                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $out[] = 0x10000 + (($unit - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2; // Low surrogate consumed as well
                    continue;
                }
            }
            $out[] = $unit;
        }
        return $out;
    }

    /**
     * Convert an UTF-7 string into an UCS-4 array
     *
     * A base64 run ends at the first character outside the base64 alphabet or
     * at the end of the input. A terminating "-" is absorbed, any other
     * terminator is kept as a literal character.
     *
     * @param string $input  The UTF-7 string
     * @param string $sc  The shift character introducing a base64 run
     * @param bool $imap  True for modified UTF-7, where "," replaces "/" inside base64 runs
     * @return array  Array of code points
     * @access private
     */
    private static function utf7_ucs4array($input, $sc = '+', $imap = false)
    {
        $output   = array();
        $inp_len  = strlen($input);
        $in_b64   = false;
        $b64      = '';
        $alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+'.($imap ? ',' : '/');

        for ($k = 0; $k < $inp_len; ++$k) {
            $c = $input[$k];
            if (chr(0) === $c) continue; // Ignore zero bytes
            if ($in_b64) {
                if (false !== strpos($alphabet, $c)) {
                    $b64 .= $c;
                    continue;
                }
                // Sequence got terminated
                $in_b64 = false;
                if ('' === $b64) {
                    if ('-' === $c) { // Shift char directly followed by "-" is the literal shift char
                        $output[] = ord($sc);
                        continue;
                    }
                } else {
                    foreach (self::utf7_decode_run($b64, $imap) as $cp) {
                        $output[] = $cp;
                    }
                    $b64 = '';
                    if ('-' === $c) continue; // Explicit terminator is absorbed
                }
                // Any other terminator is an ordinary character, handled below
            }
            if ($sc === $c) {
                $in_b64 = true;
                continue;
            }
            $output[] = ord($c);
        }
        // End of input terminates a still open base64 run
        if ($in_b64 && '' !== $b64) {
            foreach (self::utf7_decode_run($b64, $imap) as $cp) {
                $output[] = $cp;
            }
        }
        return $output;
    }

    /**
     * Convert an UCS-4 array into a modified UTF-7 (IMAP mailbox name) string
     *
     * @param array $input  Array of code points
     * @return string  The modified UTF-7 string
     * @access private
     */
    private static function ucs4array_utf7imap($input)
    {
        return self::ucs4array_utf7($input, '&', true);
    }

    /**
     * Build the big endian UTF-16 bytes of one code point for a base64 run
     *
     * Code points above U+FFFF become a surrogate pair. Values above U+10FFFF
     * are replaced by the safe char in safe mode and raise an Exception otherwise.
     *
     * @param int $v  The code point
     * @param mixed $k  Its position in the input, used for the error message
     * @return string  Two or four bytes
     * @throws Exception  On a value above U+10FFFF when safe mode is off
     * @access private
     */
    private static function codepoint_to_utf16be($v, $k)
    {
        if ($v > 0x10FFFF) {
            if (!self::$safe_mode) {
                throw new Exception('Conversion from UCS-4 to UTF-7 failed: malformed input at position '.$k);
            }
            $v = (self::$safe_char <= 0x10FFFF) ? self::$safe_char : 0xFFFC;
        }
        if ($v > 0xFFFF) {
            $v -= 0x10000;
            $high = 0xD800 | ($v >> 10);
            $low  = 0xDC00 | ($v & 0x3FF);
            return chr($high >> 8).chr($high & 255).chr($low >> 8).chr($low & 255);
        }
        return chr(($v >> 8) & 255).chr($v & 255);
    }

    /**
     * Turn collected UTF-16 bytes into a complete, terminated base64 run
     *
     * @param string $bytes  The big endian UTF-16 bytes to encode
     * @param string $sc  The shift character
     * @param bool $imap  True to use "," in place of "/" (modified UTF-7)
     * @return string  Shift char, base64 without padding, and the closing "-"
     * @access private
     */
    private static function utf7_encode_run($bytes, $sc, $imap)
    {
        if ($bytes === chr(0).$sc) { // A lone shift char has the short form
            return $sc.'-';
        }
        $b64 = rtrim(base64_encode($bytes), '=');
        if ($imap) {
            $b64 = strtr($b64, '/', ',');
        }
        return $sc.$b64.'-';
    }

    /**
     * Convert an UCS-4 array into an UTF-7 string
     *
     * Code points 0x20 to 0x7e other than the shift char are written literally,
     * everything else is collected into base64 runs.
     *
     * @param array $input  Array of code points
     * @param string $sc  The shift character introducing a base64 run
     * @param bool $imap  True for modified UTF-7, where "," replaces "/" inside base64 runs
     * @return string  The UTF-7 string
     * @throws Exception  On a value above U+10FFFF when safe mode is off
     * @access private
     */
    private static function ucs4array_utf7($input, $sc = '+', $imap = false)
    {
        $output = '';
        $pending = ''; // UTF-16 bytes of the base64 run being collected
        $sc_ord = ord($sc);
        foreach ($input as $k => $v) {
            if (0x20 <= $v && $v <= 0x7e && $v != $sc_ord) {
                if ('' !== $pending) {
                    $output .= self::utf7_encode_run($pending, $sc, $imap);
                    $pending = '';
                }
                $output .= chr($v);
            } else {
                $pending .= self::codepoint_to_utf16be($v, $k);
            }
        }
        if ('' !== $pending) {
            $output .= self::utf7_encode_run($pending, $sc, $imap);
        }
        return $output;
    }

    /**
     * Convert UCS-4 array into UCS-4 string (Big Endian, 4 bytes per code point)
     *
     * @param array $input  Array of code points
     * @return string  The UCS-4 string
     * @access   private
     */
    private static function ucs4array_ucs4($input)
    {
        $output = '';
        foreach ($input as $v) {
            $output .= pack('N', $v);
        }
        return $output;
    }

    /**
     * Convert UCS-4 string (Big Endian, 4 bytes per code point) into UCS-4 array
     *
     * @param string $input  The UCS-4 string
     * @return array  Array of code points
     * @throws Exception  If the input length is not a multiple of 4
     * @access   private
     */
    private static function ucs4_ucs4array($input)
    {
        $inp_len = strlen($input);
        // Input length must be dividable by 4
        if ($inp_len % 4) {
            throw new Exception('Input UCS4 string is broken');
        }
        // Empty input - return empty output
        if (!$inp_len) return array();

        // unpack() numbers its results from 1 - reindex from 0
        return array_values(unpack('N*', $input));
    }
}
?>