Blame view

libraries/vendor/joomla/string/src/phputf8/utils/position.php 4.99 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168
<?php
/**
* Locate a byte index given a UTF-8 character index
* @package utf8
*/

//--------------------------------------------------------------------
/**
* Given a string and a character index in the string, in
* terms of the UTF-8 character position, returns the byte
* index of that character. Can be useful when you want to
* PHP's native string functions but we warned, locating
* the byte can be expensive
* Takes variable number of parameters - first must be
* the search string then 1 to n UTF-8 character positions
* to obtain byte indexes for - it is more efficient to search
* the string for multiple characters at once, than make
* repeated calls to this function
*
* @author Chris Smith<chris@jalakai.co.uk>
* @param string string to locate index in
* @param int (n times)
* @return mixed - int if only one input int, array if more
* @return boolean TRUE if it's all ASCII
* @package utf8
*/
function utf8_byte_position() {

    $args = func_get_args();
    $str =& array_shift($args);
    if (!is_string($str)) return false;

    $result = array();

    // trivial byte index, character offset pair
    $prev = array(0,0);

    // use a short piece of str to estimate bytes per character
    // $i (& $j) -> byte indexes into $str
    $i = utf8_locate_next_chr($str, 300);

    // $c -> character offset into $str
    $c = strlen(utf8_decode(substr($str,0,$i)));

    // deal with arguments from lowest to highest
    sort($args);

    foreach ($args as $offset) {
        // sanity checks FIXME

        // 0 is an easy check
        if ($offset == 0) { $result[] = 0; continue; }

        // ensure no endless looping
        $safety_valve = 50;

        do {

            if ( ($c - $prev[1]) == 0 ) {
                // Hack: gone past end of string
                $error = 0;
                $i = strlen($str);
                break;
            }

            $j = $i + (int)(($offset-$c) * ($i - $prev[0]) / ($c - $prev[1]));

            // correct to utf8 character boundary
            $j = utf8_locate_next_chr($str, $j);

            // save the index, offset for use next iteration
            $prev = array($i,$c);

            if ($j > $i) {
                // determine new character offset
                $c += strlen(utf8_decode(substr($str,$i,$j-$i)));
            } else {
                // ditto
                $c -= strlen(utf8_decode(substr($str,$j,$i-$j)));
            }

            $error = abs($c-$offset);

            // ready for next time around
            $i = $j;

        // from 7 it is faster to iterate over the string
        } while ( ($error > 7) && --$safety_valve) ;

        if ($error && $error <= 7) {

            if ($c < $offset) {
                // move up
                while ($error--) { $i = utf8_locate_next_chr($str,++$i); }
            } else {
                // move down
                while ($error--) { $i = utf8_locate_current_chr($str,--$i); }
            }

            // ready for next arg
            $c = $offset;
        }
        $result[] = $i;
    }

    if ( count($result) == 1 ) {
        return $result[0];
    }

    return $result;
}

//--------------------------------------------------------------------
/**
* Given a string and any byte index, returns the byte index
* of the start of the current UTF-8 character, relative to supplied
* position. If the current character begins at the same place as the
* supplied byte index, that byte index will be returned. Otherwise
* this function will step backwards, looking for the index where
* curent UTF-8 character begins
* @author Chris Smith<chris@jalakai.co.uk>
* @param string
* @param int byte index in the string
* @return int byte index of start of next UTF-8 character
* @package utf8
*/
function utf8_locate_current_chr( &$str, $idx ) {

    if ($idx <= 0) return 0;

    $limit = strlen($str);
    if ($idx >= $limit) return $limit;

    // Binary value for any byte after the first in a multi-byte UTF-8 character
    // will be like 10xxxxxx so & 0xC0 can be used to detect this kind
    // of byte - assuming well formed UTF-8
    while ($idx && ((ord($str[$idx]) & 0xC0) == 0x80)) $idx--;

    return $idx;
}

//--------------------------------------------------------------------
/**
* Given a string and any byte index, returns the byte index
* of the start of the next UTF-8 character, relative to supplied
* position. If the next character begins at the same place as the
* supplied byte index, that byte index will be returned.
* @author Chris Smith<chris@jalakai.co.uk>
* @param string
* @param int byte index in the string
* @return int byte index of start of next UTF-8 character
* @package utf8
*/
function utf8_locate_next_chr( &$str, $idx ) {

    if ($idx <= 0) return 0;

    $limit = strlen($str);
    if ($idx >= $limit) return $limit;

    // Binary value for any byte after the first in a multi-byte UTF-8 character
    // will be like 10xxxxxx so & 0xC0 can be used to detect this kind
    // of byte - assuming well formed UTF-8
    while (($idx < $limit) && ((ord($str[$idx]) & 0xC0) == 0x80)) $idx++;

    return $idx;
}