punycode.php

Non-length-limited punycode en-/decoder
git clone git://git.finwo.net/lib/punycode.php
Log | Files | Refs | README | LICENSE

Punycode.php (9354B)


      1 <?php
      2 
      3 namespace Finwo\Punycode;
      4 
      5 /**
      6  * Class Punycode
      7  *
      8  * Fully static Punycode en-/decoder based on https://www.ietf.org/rfc/rfc3492.txt
      9  * This encoder does not limit string sizes, like https://github.com/true/php-punycode does
     10  *
     11  * @package Finwo\Punycode
     12  */
     class Punycode
     {
         /**
          * Bootstring parameter values (RFC 3492, section 5), plus the ACE
          * prefix that encode() prepends and decode() strips.
          */
         const BASE         = 36;
         const DAMP         = 700;
         const DELIMITER    = '-';
         const INITIAL_BIAS = 72;
         const INITIAL_N    = 128;
         const PREFIX       = 'xn--';
         const SKEW         = 38;
         const TMAX         = 26;
         const TMIN         = 1;

         /**
          * Maps each Punycode digit character to its numeric value (0-35).
          * See page 9 of the RFC.
          *
          * @var array
          */
         protected static $decodeTable = array(
             'a' =>  0, 'b' =>  1, 'c' =>  2, 'd' =>  3, 'e' =>  4, 'f' =>  5,
             'g' =>  6, 'h' =>  7, 'i' =>  8, 'j' =>  9, 'k' => 10, 'l' => 11,
             'm' => 12, 'n' => 13, 'o' => 14, 'p' => 15, 'q' => 16, 'r' => 17,
             's' => 18, 't' => 19, 'u' => 20, 'v' => 21, 'w' => 22, 'x' => 23,
             'y' => 24, 'z' => 25, '0' => 26, '1' => 27, '2' => 28, '3' => 29,
             '4' => 30, '5' => 31, '6' => 32, '7' => 33, '8' => 34, '9' => 35,
         );

         /**
          * Inverse of $decodeTable (numeric value => digit character).
          * Built lazily by init() on the first encode()/decode() call.
          *
          * @var array
          */
         protected static $encodeTable = array();

         /**
          * Whether init() has already run.
          *
          * @var bool
          */
         protected static $initialized = false;

         /**
          * Character encoding handed to the mb_* functions. The code point
          * conversion helpers below read and write UTF-8 byte sequences.
          *
          * @var string
          */
         public static $encoding = 'UTF-8';

         /**
          * Build the value => digit lookup from the keys of $decodeTable.
          *
          * @return array
          */
         protected static function buildEncodeTable()
         {
             if (!count(self::$encodeTable)) {
                 // The decode table is declared in value order, so its keys
                 // form the value => digit list directly.
                 self::$encodeTable = array_keys(self::$decodeTable);
             }

             return self::$encodeTable;
         }

         /**
          * Initialize the encoder if needed
          */
         protected static function init()
         {
             if (self::$initialized) {
                 return;
             }
             self::buildEncodeTable();
             self::$initialized = true;
         }

         /**
          * List code points for a given input
          *
          * @param string $input
          *
          * @return array Multi-dimension array with basic, non-basic and aggregated code points
          */
         protected static function listCodePoints($input)
         {
             $codePoints = array(
                 'all'      => array(),
                 'basic'    => array(),
                 'nonBasic' => array(),
             );

             $length = mb_strlen($input, self::$encoding);
             for ($i = 0; $i < $length; $i++) {
                 $char = mb_substr($input, $i, 1, self::$encoding);
                 $code = self::charToCodePoint($char);

                 // Code points below 128 are the "basic" (ASCII) ones.
                 $bucket                = ($code < 128) ? 'basic' : 'nonBasic';
                 $codePoints[$bucket][] = $code;
                 $codePoints['all'][]   = $code;
             }

             return $codePoints;
         }

         /**
          * Convert a single or multi-byte (UTF-8) character to its code point
          *
          * The length of the sequence is derived from the lead byte; the
          * continuation bytes are read without validation.
          *
          * @param string $char
          *
          * @return integer
          */
         protected static function charToCodePoint($char)
         {
             $code = ord($char[0]);
             if ($code < 128) {
                 return $code;
             } elseif ($code < 224) {
                 return (($code - 192) * 64) + (ord($char[1]) - 128);
             } elseif ($code < 240) {
                 return (($code - 224) * 4096) + ((ord($char[1]) - 128) * 64) + (ord($char[2]) - 128);
             } else {
                 return (($code - 240) * 262144) + ((ord($char[1]) - 128) * 4096) + ((ord($char[2]) - 128) * 64) + (ord($char[3]) - 128);
             }
         }

         /**
          * Convert a code point to its single or multi-byte (UTF-8) character
          *
          * @param integer $code
          *
          * @return string
          */
         protected static function codePointToChar($code)
         {
             if ($code <= 0x7F) {
                 return chr($code);
             } elseif ($code <= 0x7FF) {
                 return chr(($code >> 6) + 192) . chr(($code & 63) + 128);
             } elseif ($code <= 0xFFFF) {
                 return chr(($code >> 12) + 224) . chr((($code >> 6) & 63) + 128) . chr(($code & 63) + 128);
             } else {
                 return chr(($code >> 18) + 240) . chr((($code >> 12) & 63) + 128) . chr((($code >> 6) & 63) + 128) . chr(($code & 63) + 128);
             }
         }

         /**
          * Calculate the bias threshold to fall between TMIN and TMAX
          *
          * @param integer $k
          * @param integer $bias
          *
          * @return integer
          */
         protected static function calculateThreshold($k, $bias)
         {
             if ($k <= $bias + static::TMIN) {
                 return static::TMIN;
             } elseif ($k >= $bias + static::TMAX) {
                 return static::TMAX;
             }

             return $k - $bias;
         }

         /**
          * Bias adaptation
          *
          * @param integer $delta
          * @param integer $numPoints
          * @param boolean $firstTime
          *
          * @return integer
          */
         protected static function adapt($delta, $numPoints, $firstTime)
         {
             // "/" yields a float in PHP whenever the division is not exact,
             // hence the explicit truncation after every division.
             $delta = (int)(
             ($firstTime)
                 ? $delta / static::DAMP
                 : $delta / 2
             );
             $delta += (int)($delta / $numPoints);

             $limit = ((static::BASE - static::TMIN) * static::TMAX) / 2;

             $k = 0;
             while ($delta > $limit) {
                 $delta = (int)($delta / (static::BASE - static::TMIN));
                 $k     = $k + static::BASE;
             }
             $k = $k + (int)(((static::BASE - static::TMIN + 1) * $delta) / ($delta + static::SKEW));

             return $k;
         }

         /**
          * Encode a string as a single Punycode label.
          *
          * Input consisting only of basic (ASCII) code points is returned
          * unchanged. Anything else is returned as PREFIX followed by the
          * Punycode form of the whole input.
          *
          * @param string $input
          *
          * @return string $encodedString
          */
         public static function encode($input)
         {
             self::init();
             $codePoints = self::listCodePoints($input);

             $n     = static::INITIAL_N;
             $bias  = static::INITIAL_BIAS;
             $delta = 0;
             $h     = $b = count($codePoints['basic']);

             // The basic code points are copied to the output literally.
             $output = '';
             foreach ($codePoints['basic'] as $code) {
                 $output .= self::codePointToChar($code);
             }
             if ($input === $output) {
                 return $output;
             }
             if ($b > 0) {
                 $output .= static::DELIMITER;
             }

             // Non-basic code points are inserted in ascending order.
             $codePoints['nonBasic'] = array_unique($codePoints['nonBasic']);
             sort($codePoints['nonBasic']);

             $next   = 0;
             $length = count($codePoints['all']);
             while ($h < $length) {
                 $m     = $codePoints['nonBasic'][$next++];
                 $delta = $delta + ($m - $n) * ($h + 1);
                 $n     = $m;

                 foreach ($codePoints['all'] as $c) {
                     if ($c < $n || $c < static::INITIAL_N) {
                         $delta++;
                     }
                     if ($c === $n) {
                         // Emit $delta as a generalized variable-length integer.
                         $q = $delta;
                         for ($k = static::BASE; ; $k += static::BASE) {
                             $t = self::calculateThreshold($k, $bias);
                             if ($q < $t) {
                                 break;
                             }

                             $code = $t + (($q - $t) % (static::BASE - $t));
                             $output .= static::$encodeTable[$code];

                             // Truncate explicitly: "/" would otherwise leave a
                             // float that is later used as an array offset.
                             $q = (int)(($q - $t) / (static::BASE - $t));
                         }

                         $output .= static::$encodeTable[$q];
                         $bias  = self::adapt($delta, $h + 1, ($h === $b));
                         $delta = 0;
                         $h++;
                     }
                 }

                 $delta++;
                 $n++;
             }

             return static::PREFIX . $output;
         }

         /**
          * Decode a Punycode label produced by encode().
          *
          * Strings for which isPunycode() is false are returned unchanged. So
          * are strings whose payload is malformed: an invalid digit, a
          * truncated integer, an arithmetic overflow, or a resulting code point
          * outside the Unicode scalar range. Digits are matched
          * case-insensitively.
          *
          * @param string $encodedString
          *
          * @return string $decodedString
          */
         public static function decode($encodedString)
         {
             self::init();
             if (!self::isPunycode($encodedString)) {
                 return $encodedString;
             }
             $payload = substr($encodedString, strlen(static::PREFIX));
             $n       = static::INITIAL_N;
             $i       = 0;
             $bias    = static::INITIAL_BIAS;
             $output  = '';

             // Everything before the last delimiter is the literal basic part;
             // without a delimiter the whole payload consists of digits.
             $pos = strrpos($payload, static::DELIMITER);
             if ($pos !== false) {
                 $output = substr($payload, 0, $pos++);
             } else {
                 $pos = 0;
             }

             $outputLength = strlen($output);
             $inputLength  = strlen($payload);
             while ($pos < $inputLength) {
                 $oldi = $i;
                 $w    = 1;

                 for ($k = static::BASE; ; $k += static::BASE) {
                     if ($pos >= $inputLength) {
                         // Input ended in the middle of an integer.
                         return $encodedString;
                     }
                     $char = strtolower($payload[$pos++]);
                     if (!isset(static::$decodeTable[$char])) {
                         // Not a Punycode digit.
                         return $encodedString;
                     }

                     $digit = static::$decodeTable[$char];
                     $i     = $i + ($digit * $w);
                     if (!is_int($i)) {
                         // Integer overflow turned the value into a float.
                         return $encodedString;
                     }
                     $t = self::calculateThreshold($k, $bias);

                     if ($digit < $t) {
                         break;
                     }

                     $w = $w * (static::BASE - $t);
                     if (!is_int($w)) {
                         return $encodedString;
                     }
                 }

                 $bias = self::adapt($i - $oldi, ++$outputLength, ($oldi === 0));
                 $n    = $n + (int)($i / $outputLength);
                 if ($n > 0x10FFFF || ($n >= 0xD800 && $n <= 0xDFFF)) {
                     // Not a Unicode scalar value; cannot be written as UTF-8.
                     return $encodedString;
                 }
                 $i      = $i % ($outputLength);
                 $output = mb_substr($output, 0, $i, self::$encoding) . self::codePointToChar($n) . mb_substr($output, $i, $outputLength - 1, self::$encoding);

                 $i++;
             }

             return $output;
         }

         /**
          * Tell whether a string looks like a Punycode label: it starts with
          * PREFIX and has at least one character after it.
          *
          * A delimiter is not required, because labels without any basic
          * code point (e.g. "xn--ihqwcrb4cv8a8dqg056pqjye") do not contain one.
          *
          * @param string $stringToCheck
          *
          * @return bool
          */
         public static function isPunycode($stringToCheck)
         {
             $prefixLength = strlen(static::PREFIX);

             if (strncmp($stringToCheck, static::PREFIX, $prefixLength) !== 0) {
                 return false;
             }

             return strlen($stringToCheck) > $prefixLength;
         }
     }