Idn.php 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com> and Trevor Rowbotham <trevor.rowbotham@pm.me>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Polyfill\Intl\Idn;
  11. use Exception;
  12. use Normalizer;
  13. use Symfony\Polyfill\Intl\Idn\Resources\unidata\DisallowedRanges;
  14. use Symfony\Polyfill\Intl\Idn\Resources\unidata\Regex;
  15. /**
  16. * @see https://www.unicode.org/reports/tr46/
  17. *
  18. * @internal
  19. */
  20. final class Idn
  21. {
  // Error bit flags. Each failed check ORs one of these into Info::$errors, which is
  // returned to the caller as $idna_info['errors'].
  22. const ERROR_EMPTY_LABEL = 1;
  23. const ERROR_LABEL_TOO_LONG = 2;
  24. const ERROR_DOMAIN_NAME_TOO_LONG = 4;
  25. const ERROR_LEADING_HYPHEN = 8;
  26. const ERROR_TRAILING_HYPHEN = 0x10;
  27. const ERROR_HYPHEN_3_4 = 0x20;
  28. const ERROR_LEADING_COMBINING_MARK = 0x40;
  29. const ERROR_DISALLOWED = 0x80;
  30. const ERROR_PUNYCODE = 0x100;
  31. const ERROR_LABEL_HAS_DOT = 0x200;
  32. const ERROR_INVALID_ACE_LABEL = 0x400;
  33. const ERROR_BIDI = 0x800;
  34. const ERROR_CONTEXTJ = 0x1000;
  35. const ERROR_CONTEXTO_PUNCTUATION = 0x2000;
  36. const ERROR_CONTEXTO_DIGITS = 0x4000;
  // Accepted values for the $variant argument of idn_to_ascii() and idn_to_utf8().
  37. const INTL_IDNA_VARIANT_2003 = 0;
  38. const INTL_IDNA_VARIANT_UTS46 = 1;
  // Byte-length limits enforced by validateDomainAndLabelLength().
  39. const MAX_DOMAIN_SIZE = 253;
  40. const MAX_LABEL_SIZE = 63;
  // Parameters used by punycodeEncode(), punycodeDecode() and adaptBias()
  // (those methods reference RFC 3492).
  41. const BASE = 36;
  42. const TMIN = 1;
  43. const TMAX = 26;
  44. const SKEW = 38;
  45. const DAMP = 700;
  46. const INITIAL_BIAS = 72;
  47. const INITIAL_N = 128;
  48. const DELIMITER = '-';
  // Upper bound used by the explicit integer-overflow guards in the Punycode routines.
  49. const MAX_INT = 2147483647;
  50. /**
  51. * Contains the numeric value of a basic code point (for use in representing integers) in the
  52. * range 0 to BASE-1, or -1 if the code point does not represent a value. punycodeDecode()
  * indexes this table with a single byte value (0-255).
  53. *
  54. * @var array<int, int>
  55. */
  56. private static $basicToDigit = array(
  57. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  58. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  59. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  60. 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
  61. -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
  62. 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
  63. -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
  64. 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
  65. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  66. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  67. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  68. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  69. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  70. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  71. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  72. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  73. );
  74. /**
  * Table keyed by code point, loaded on first use by isValidContextJ() from
  * Resources/unidata/virama.php.
  *
  75. * @var array<int, int>
  76. */
  77. private static $virama;
  78. /**
  * Code point => replacement string; loaded by lookupCodePointStatus() from
  * Resources/unidata/mapped.php.
  *
  79. * @var array<int, string>
  80. */
  81. private static $mapped;
  82. /**
  * Code points with status "ignored"; loaded from Resources/unidata/ignored.php.
  *
  83. * @var array<int, bool>
  84. */
  85. private static $ignored;
  86. /**
  * Code point => mapping for status "deviation"; loaded from Resources/unidata/deviation.php.
  *
  87. * @var array<int, string>
  88. */
  89. private static $deviation;
  90. /**
  * Code points with status "disallowed"; loaded from Resources/unidata/disallowed.php.
  *
  91. * @var array<int, bool>
  92. */
  93. private static $disallowed;
  94. /**
  * Code point => mapping; reported as "mapped" unless UseSTD3ASCIIRules is set, in which
  * case the status is "disallowed". Loaded from Resources/unidata/disallowed_STD3_mapped.php.
  *
  95. * @var array<int, string>
  96. */
  97. private static $disallowed_STD3_mapped;
  98. /**
  * Code points reported as "valid" unless UseSTD3ASCIIRules is set, in which case the status
  * is "disallowed". Loaded from Resources/unidata/disallowed_STD3_valid.php.
  *
  99. * @var array<int, bool>
  100. */
  101. private static $disallowed_STD3_valid;
  102. /**
  * True once lookupCodePointStatus() has loaded the mapping tables above.
  *
  103. * @var bool
  104. */
  105. private static $mappingTableLoaded = false;
  /**
   * Converts a domain name to its ASCII form: the name is run through process(), every label
   * that still contains non-ASCII code points is Punycode-encoded with an "xn--" prefix, and
   * the DNS length limits are verified.
   *
   * @see https://www.unicode.org/reports/tr46/#ToASCII
   *
   * @param string $domainName Domain name to convert (cast to string before processing)
   * @param int    $options    Bitmask of IDNA_* option flags
   * @param int    $variant    INTL_IDNA_VARIANT_2003 or INTL_IDNA_VARIANT_UTS46
   * @param array  $idna_info  Overwritten with the keys "result", "isTransitionalDifferent"
   *                           and "errors"
   *
   * @return string|false The converted name, or false if any error flag was recorded
   */
  public static function idn_to_ascii($domainName, $options = IDNA_DEFAULT, $variant = self::INTL_IDNA_VARIANT_UTS46, &$idna_info = array())
  {
      $isVariant2003 = self::INTL_IDNA_VARIANT_2003 === $variant;
      $isVariantUts46 = self::INTL_IDNA_VARIANT_UTS46 === $variant;

      if (\PHP_VERSION_ID >= 70200 && $isVariant2003) {
          @trigger_error('idn_to_ascii(): INTL_IDNA_VARIANT_2003 is deprecated', E_USER_DEPRECATED);
      }

      $processingOptions = array(
          'CheckHyphens' => true,
          'CheckBidi' => $isVariant2003 || 0 !== ($options & \IDNA_CHECK_BIDI),
          'CheckJoiners' => $isVariantUts46 && 0 !== ($options & \IDNA_CHECK_CONTEXTJ),
          'UseSTD3ASCIIRules' => 0 !== ($options & \IDNA_USE_STD3_RULES),
          'Transitional_Processing' => $isVariant2003 || 0 === ($options & \IDNA_NONTRANSITIONAL_TO_ASCII),
          'VerifyDnsLength' => true,
      );

      $info = new Info();
      $labels = self::process((string) $domainName, $processingOptions, $info);

      foreach ($labels as $index => $label) {
          // Only labels containing non-ASCII code points are converted to punycode, and only
          // when they hold no character from the gen-delims set specified in
          // {@link https://ietf.org/rfc/rfc3987.html#section-2.2}
          $hasNonAscii = 1 === preg_match('/[^\x00-\x7F]/', $label);

          if (!$hasNonAscii || false !== strpbrk($label, ':/?#[]@')) {
              continue;
          }

          try {
              $labels[$index] = 'xn--'.self::punycodeEncode($label);
          } catch (Exception $e) {
              // The label is left as it was; only the error flag is recorded.
              $info->errors |= self::ERROR_PUNYCODE;
          }
      }

      if ($processingOptions['VerifyDnsLength']) {
          self::validateDomainAndLabelLength($labels, $info);
      }

      $result = implode('.', $labels);
      $idna_info = array(
          'result' => $result,
          'isTransitionalDifferent' => $info->transitionalDifferent,
          'errors' => $info->errors,
      );

      if (0 !== $info->errors) {
          return false;
      }

      return $result;
  }
  /**
   * Converts a domain name to its Unicode (UTF-8) form by running it through process() and
   * joining the resulting labels. No DNS length verification is requested here.
   *
   * @see https://www.unicode.org/reports/tr46/#ToUnicode
   *
   * @param string $domainName Domain name to convert (cast to string before processing)
   * @param int    $options    Bitmask of IDNA_* option flags
   * @param int    $variant    INTL_IDNA_VARIANT_2003 or INTL_IDNA_VARIANT_UTS46
   * @param array  $idna_info  Overwritten with the keys "result", "isTransitionalDifferent"
   *                           and "errors"
   *
   * @return string|false The converted name, or false if any error flag was recorded
   */
  public static function idn_to_utf8($domainName, $options = IDNA_DEFAULT, $variant = self::INTL_IDNA_VARIANT_UTS46, &$idna_info = array())
  {
      $isVariant2003 = self::INTL_IDNA_VARIANT_2003 === $variant;
      $isVariantUts46 = self::INTL_IDNA_VARIANT_UTS46 === $variant;

      if (\PHP_VERSION_ID >= 70200 && $isVariant2003) {
          @trigger_error('idn_to_utf8(): INTL_IDNA_VARIANT_2003 is deprecated', E_USER_DEPRECATED);
      }

      // 'VerifyDnsLength' is deliberately absent: process() treats its absence as ToUnicode.
      $processingOptions = array(
          'CheckHyphens' => true,
          'CheckBidi' => $isVariant2003 || 0 !== ($options & \IDNA_CHECK_BIDI),
          'CheckJoiners' => $isVariantUts46 && 0 !== ($options & \IDNA_CHECK_CONTEXTJ),
          'UseSTD3ASCIIRules' => 0 !== ($options & \IDNA_USE_STD3_RULES),
          'Transitional_Processing' => $isVariant2003 || 0 === ($options & \IDNA_NONTRANSITIONAL_TO_UNICODE),
      );

      $info = new Info();
      $labels = self::process((string) $domainName, $processingOptions, $info);

      $result = implode('.', $labels);
      $idna_info = array(
          'result' => $result,
          'isTransitionalDifferent' => $info->transitionalDifferent,
          'errors' => $info->errors,
      );

      if (0 !== $info->errors) {
          return false;
      }

      return $result;
  }
  /**
   * Checks every ZERO WIDTH NON-JOINER (U+200C) and ZERO WIDTH JOINER (U+200D) in a label
   * against the ContextJ rules.
   *
   * A joiner is accepted when the preceding code point is listed in the virama table. A
   * U+200C is additionally accepted when Regex::ZWNJ matches its joining context. Anything
   * else, including a joiner in first position, makes the label invalid.
   *
   * @param array<int, int> $codePoints Code points of the label, in order
   * @param string          $label      The same label as a UTF-8 string
   *
   * @return bool True when all joiners are in a permitted context
   */
  private static function isValidContextJ(array $codePoints, $label)
  {
      if (!isset(self::$virama)) {
          self::$virama = require __DIR__.\DIRECTORY_SEPARATOR.'Resources'.\DIRECTORY_SEPARATOR.'unidata'.\DIRECTORY_SEPARATOR.'virama.php';
      }

      // Byte offset in $label from which the next ZWNJ context search starts.
      $offset = 0;

      foreach ($codePoints as $i => $codePoint) {
          if (0x200C !== $codePoint && 0x200D !== $codePoint) {
              continue;
          }

          // A joiner cannot be the first code point: there is nothing before it to join.
          if (!isset($codePoints[$i - 1])) {
              return false;
          }

          // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
          if (isset(self::$virama[$codePoints[$i - 1]])) {
              continue;
          }

          // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C(Joining_Type:T)*(Joining_Type:{R,D})) Then
          // True;
          // Generated RegExp = ([Joining_Type:{L,D}][Joining_Type:T]*\u200C[Joining_Type:T]*)[Joining_Type:{R,D}]
          if (0x200C === $codePoint && 1 === preg_match(Regex::ZWNJ, $label, $matches, PREG_OFFSET_CAPTURE, $offset)) {
              // Resume right after the end of capture group 1. With PREG_OFFSET_CAPTURE,
              // $matches[1][1] is the byte position where the group starts. Adding only the
              // group's length to the previous offset ignores any text before the match and
              // lets a later ZWNJ re-match this same, already consumed, context.
              $offset = $matches[1][1] + \strlen($matches[1][0]);

              continue;
          }

          return false;
      }

      return true;
  }
  /**
   * Applies the mapping step to a domain name: each code point of $input is looked up with
   * lookupCodePointStatus() and either kept, dropped or replaced.
   *
   * @see https://www.unicode.org/reports/tr46/#ProcessingStepMap
   *
   * @param string              $input
   * @param array<string, bool> $options Reads 'UseSTD3ASCIIRules' and 'Transitional_Processing'
   * @param Info                $info    Receives ERROR_DISALLOWED and the transitionalDifferent flag
   *
   * @return string The mapped string
   */
  private static function mapCodePoints($input, array $options, Info $info)
  {
      $std3Rules = $options['UseSTD3ASCIIRules'];
      $isTransitional = $options['Transitional_Processing'];
      $mappedString = '';

      foreach (self::utf8Decode($input) as $codePoint) {
          $entry = self::lookupCodePointStatus($codePoint, $std3Rules);
          $status = $entry['status'];

          if ('ignored' === $status) {
              // Ignored code points are removed from the output.
              continue;
          }

          if ('mapped' === $status) {
              $mappedString .= $entry['mapping'];

              continue;
          }

          if ('deviation' === $status) {
              $info->transitionalDifferent = true;
              $mappedString .= $isTransitional ? $entry['mapping'] : mb_chr($codePoint, 'utf-8');

              continue;
          }

          if ('disallowed' === $status) {
              // The error is recorded, but the code point is still copied to the output.
              $info->errors |= self::ERROR_DISALLOWED;
          }

          if ('disallowed' === $status || 'valid' === $status) {
              $mappedString .= mb_chr($codePoint, 'utf-8');
          }
      }

      return $mappedString;
  }
  /**
   * Runs the shared processing steps on a domain name: map, normalize to NFC, split into
   * labels, then decode (for "xn--" labels) and validate each label. Errors are accumulated
   * in $info; the labels are returned regardless of whether errors were recorded.
   *
   * @see https://www.unicode.org/reports/tr46/#Processing
   *
   * @param string              $domain
   * @param array<string, bool> $options
   * @param Info                $info    Receives error flags and bidi state
   *
   * @return array<int, string> The processed labels
   */
  private static function process($domain, array $options, Info $info)
  {
      // A missing VerifyDnsLength option means ToUnicode; otherwise this is ToASCII and the
      // option's value decides whether empty labels are reported.
      $reportEmptyLabels = !isset($options['VerifyDnsLength']) || $options['VerifyDnsLength'];

      if ($reportEmptyLabels && '' === $domain) {
          $info->errors |= self::ERROR_EMPTY_LABEL;

          return array($domain);
      }

      // Step 1. Map each code point in the domain name string
      $mapped = self::mapCodePoints($domain, $options, $info);

      // Step 2. Normalize the domain name string to Unicode Normalization Form C.
      if (!Normalizer::isNormalized($mapped, Normalizer::FORM_C)) {
          $mapped = Normalizer::normalize($mapped, Normalizer::FORM_C);
      }

      // Step 3. Break the string into labels at U+002E (.) FULL STOP.
      $labels = explode('.', $mapped);
      $finalIndex = \count($labels) - 1;

      // Step 4. Convert and validate each label in the domain name string.
      foreach ($labels as $index => $label) {
          $labelOptions = $options;

          if (0 === strncmp($label, 'xn--', 4)) {
              try {
                  $label = self::punycodeDecode(substr($label, 4));
              } catch (Exception $e) {
                  // An undecodable label is reported and skips validation entirely.
                  $info->errors |= self::ERROR_PUNYCODE;

                  continue;
              }

              // Decoded labels are always validated with non-transitional processing.
              $labelOptions['Transitional_Processing'] = false;
              $labels[$index] = $label;
          }

          // Only a trailing label that is not also the first one may be empty.
          $mayBeEmpty = $index > 0 && $index === $finalIndex;
          self::validateLabel($label, $info, $labelOptions, $mayBeEmpty);
      }

      if ($info->bidiDomain && !$info->validBidiDomain) {
          $info->errors |= self::ERROR_BIDI;
      }

      // Any input domain name string that does not record an error has been successfully
      // processed according to this specification. Conversely, if an input domain_name string
      // causes an error, then the processing of the input domain_name string fails. Determining
      // what to do with error input is up to the caller, and not in the scope of this document.
      return $labels;
  }
  /**
   * Checks one label against the Bidi rule. A label matching Regex::RTL_LABEL marks the whole
   * domain as a bidi domain and is checked with the RTL conditions (1-4); any other label is
   * checked with the LTR conditions (1, 5, 6). A failed condition clears
   * $info->validBidiDomain; a passing label leaves it untouched.
   *
   * @see https://tools.ietf.org/html/rfc5893#section-2
   *
   * @param string $label
   * @param Info   $info  Receives the bidiDomain / validBidiDomain state
   */
  private static function validateBidiLabel($label, Info $info)
  {
      if (1 === preg_match(Regex::RTL_LABEL, $label)) {
          $info->bidiDomain = true;

          // Step 1. The first character must have Bidi property L, R, or AL; R or AL makes
          //         this an RTL label.
          // Step 2. In an RTL label, only characters with the Bidi properties R, AL, AN, EN,
          //         ES, CS, ET, ON, BN, or NSM are allowed.
          // Step 3. In an RTL label, the end of the label must be a character with Bidi
          //         property R, AL, EN, or AN, followed by zero or more NSM characters.
          // Step 4. In an RTL label, if an EN is present, no AN may be present, and vice versa.
          $satisfiesRule = 1 === preg_match(Regex::BIDI_STEP_1_RTL, $label)
              && 1 !== preg_match(Regex::BIDI_STEP_2, $label)
              && 1 === preg_match(Regex::BIDI_STEP_3, $label)
              && !(1 === preg_match(Regex::BIDI_STEP_4_AN, $label) && 1 === preg_match(Regex::BIDI_STEP_4_EN, $label));
      } else {
          // We are a LTR label
          // Step 1. The first character must have Bidi property L, R, or AL; L makes this an
          //         LTR label.
          // Step 5. In an LTR label, only characters with the Bidi properties L, EN, ES, CS,
          //         ET, ON, BN, or NSM are allowed.
          // Step 6. In an LTR label, the end of the label must be a character with Bidi
          //         property L or EN, followed by zero or more NSM characters.
          $satisfiesRule = 1 === preg_match(Regex::BIDI_STEP_1_LTR, $label)
              && 1 !== preg_match(Regex::BIDI_STEP_5, $label)
              && 1 === preg_match(Regex::BIDI_STEP_6, $label);
      }

      if (!$satisfiesRule) {
          $info->validBidiDomain = false;
      }
  }
  /**
   * Flags labels longer than MAX_LABEL_SIZE bytes and domain names longer than
   * MAX_DOMAIN_SIZE bytes. The domain length counts the "." separators between labels.
   *
   * @param array<int, string> $labels
   * @param Info               $info   Receives ERROR_LABEL_TOO_LONG / ERROR_DOMAIN_NAME_TOO_LONG
   */
  private static function validateDomainAndLabelLength(array $labels, Info $info)
  {
      $labelCount = \count($labels);
      $limit = self::MAX_DOMAIN_SIZE;

      // Start from the number of "." delimiters.
      $totalBytes = $labelCount - 1;

      // An empty last label that is not also the first label is the root label. Its "."
      // delimiter is allowed on top of the limit (making it 254), and the label itself needs
      // no length check.
      if ($labelCount > 1 && '' === $labels[$labelCount - 1]) {
          ++$limit;
          --$labelCount;
      }

      $index = 0;
      while ($index < $labelCount) {
          $labelBytes = \strlen($labels[$index]);
          $totalBytes += $labelBytes;

          if ($labelBytes > self::MAX_LABEL_SIZE) {
              $info->errors |= self::ERROR_LABEL_TOO_LONG;
          }

          ++$index;
      }

      if ($totalBytes > $limit) {
          $info->errors |= self::ERROR_DOMAIN_NAME_TOO_LONG;
      }
  }
  /**
   * Checks a single label against the validity criteria and records every violation as an
   * error flag on $info. All checks run; a failure does not stop later ones.
   *
   * @see https://www.unicode.org/reports/tr46/#Validity_Criteria
   *
   * @param string              $label
   * @param Info                $info       Receives the error flags
   * @param array<string, bool> $options
   * @param bool                $canBeEmpty Whether an empty label is acceptable in this position
   */
  private static function validateLabel($label, Info $info, array $options, $canBeEmpty)
  {
      if ('' === $label) {
          $verifyLength = !isset($options['VerifyDnsLength']) || $options['VerifyDnsLength'];

          if (!$canBeEmpty && $verifyLength) {
              $info->errors |= self::ERROR_EMPTY_LABEL;
          }

          return;
      }

      // Step 1. The label must be in Unicode Normalization Form C.
      if (!Normalizer::isNormalized($label, Normalizer::FORM_C)) {
          $info->errors |= self::ERROR_INVALID_ACE_LABEL;
      }

      $codePoints = self::utf8Decode($label);

      if ($options['CheckHyphens']) {
          // Step 2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character
          // in both the third and fourth positions.
          $hyphenInThirdAndFourth = isset($codePoints[2], $codePoints[3])
              && 0x002D === $codePoints[2]
              && 0x002D === $codePoints[3];

          if ($hyphenInThirdAndFourth) {
              $info->errors |= self::ERROR_HYPHEN_3_4;
          }

          // Step 3. If CheckHyphens, the label must neither begin nor end with a U+002D
          // HYPHEN-MINUS character. The label is known to be non-empty here.
          if ('-' === $label[0]) {
              $info->errors |= self::ERROR_LEADING_HYPHEN;
          }

          if ('-' === $label[\strlen($label) - 1]) {
              $info->errors |= self::ERROR_TRAILING_HYPHEN;
          }
      }

      // Step 4. The label must not contain a U+002E (.) FULL STOP.
      if (false !== strpos($label, '.')) {
          $info->errors |= self::ERROR_LABEL_HAS_DOT;
      }

      // Step 5. The label must not begin with a combining mark, that is: General_Category=Mark.
      if (1 === preg_match(Regex::COMBINING_MARK, $label)) {
          $info->errors |= self::ERROR_LEADING_COMBINING_MARK;
      }

      // Step 6. Each code point in the label must only have certain status values according to
      // Section 5, IDNA Mapping Table: "valid", or "deviation" when processing is
      // non-transitional. The scan stops at the first offending code point.
      $isTransitional = $options['Transitional_Processing'];
      $std3Rules = $options['UseSTD3ASCIIRules'];

      foreach ($codePoints as $codePoint) {
          $entry = self::lookupCodePointStatus($codePoint, $std3Rules);
          $status = $entry['status'];
          $isAllowed = 'valid' === $status || (!$isTransitional && 'deviation' === $status);

          if (!$isAllowed) {
              $info->errors |= self::ERROR_DISALLOWED;

              break;
          }
      }

      // Step 7. If CheckJoiners, the label must satisfy the ContextJ rules from Appendix A, in
      // The Unicode Code Points and Internationalized Domain Names for Applications (IDNA)
      // [IDNA2008].
      if ($options['CheckJoiners'] && !self::isValidContextJ($codePoints, $label)) {
          $info->errors |= self::ERROR_CONTEXTJ;
      }

      // Step 8. If CheckBidi, and if the domain name is a Bidi domain name, then the label must
      // satisfy all six of the numbered conditions in [IDNA2008] RFC 5893, Section 2.
      // The check is skipped once the domain is already known to be an invalid bidi domain.
      if ($options['CheckBidi'] && (!$info->bidiDomain || $info->validBidiDomain)) {
          self::validateBidiLabel($label, $info);
      }
  }
  /**
   * Decodes a Punycode string (without the "xn--" prefix) into a UTF-8 string.
   *
   * Everything before the last delimiter is copied as basic code points; the rest is read as
   * a sequence of generalized variable-length integers, each giving the next code point and
   * the position at which to insert it.
   *
   * @see https://tools.ietf.org/html/rfc3492#section-6.2
   *
   * @param string $input
   *
   * @return string
   *
   * @throws Exception If the input holds a non-ASCII basic code point, an invalid or truncated
   *                   digit sequence, a value that overflows MAX_INT, or decodes to something
   *                   that is not a Unicode scalar value
   */
  private static function punycodeDecode($input)
  {
      $n = self::INITIAL_N;
      $out = 0;
      $i = 0;
      $bias = self::INITIAL_BIAS;
      $lastDelimIndex = strrpos($input, self::DELIMITER);
      // Number of basic code points; zero when there is no delimiter (or it is the first byte).
      $b = false === $lastDelimIndex ? 0 : $lastDelimIndex;
      $inputLength = \strlen($input);
      $output = array();
      $bytes = array_map('ord', str_split($input));

      for ($j = 0; $j < $b; ++$j) {
          if ($bytes[$j] > 0x7F) {
              throw new Exception('Invalid input');
          }

          $output[$out++] = $input[$j];
      }

      // Skip the delimiter only if basic code points were actually copied.
      if ($b > 0) {
          ++$b;
      }

      for ($in = $b; $in < $inputLength; ++$out) {
          $oldi = $i;
          $w = 1;

          // Decode one generalized variable-length integer into $i.
          for ($k = self::BASE; /* no condition */; $k += self::BASE) {
              if ($in >= $inputLength) {
                  throw new Exception('Invalid input');
              }

              $digit = self::$basicToDigit[$bytes[$in++] & 0xFF];

              if ($digit < 0) {
                  throw new Exception('Invalid input');
              }

              if ($digit > intdiv(self::MAX_INT - $i, $w)) {
                  throw new Exception('Integer overflow');
              }

              $i += $digit * $w;

              if ($k <= $bias) {
                  $t = self::TMIN;
              } elseif ($k >= $bias + self::TMAX) {
                  $t = self::TMAX;
              } else {
                  $t = $k - $bias;
              }

              if ($digit < $t) {
                  break;
              }

              $baseMinusT = self::BASE - $t;

              if ($w > intdiv(self::MAX_INT, $baseMinusT)) {
                  throw new Exception('Integer overflow');
              }

              $w *= $baseMinusT;
          }

          $outPlusOne = $out + 1;
          $bias = self::adaptBias($i - $oldi, $outPlusOne, 0 === $oldi);

          if (intdiv($i, $outPlusOne) > self::MAX_INT - $n) {
              throw new Exception('Integer overflow');
          }

          $n += intdiv($i, $outPlusOne);
          $i %= $outPlusOne;

          // $n may have grown past the Unicode range or landed on a surrogate. mb_chr() cannot
          // encode such a value (the native function returns false), which would silently drop
          // the code point from the label. Reject it as invalid input instead.
          if ($n > 0x10FFFF || ($n >= 0xD800 && $n <= 0xDFFF)) {
              throw new Exception('Invalid input');
          }

          // Insert the decoded code point at position $i, then advance the insertion point.
          array_splice($output, $i++, 0, array(mb_chr($n, 'utf-8')));
      }

      return implode('', $output);
  }
  /**
   * Encodes a UTF-8 string as Punycode (without the "xn--" prefix).
   *
   * Basic (ASCII) code points are copied first, followed by the delimiter if there were any.
   * The remaining code points are then emitted in ascending order as generalized
   * variable-length integers.
   *
   * @see https://tools.ietf.org/html/rfc3492#section-6.3
   *
   * @param string $input
   *
   * @return string
   *
   * @throws Exception If a delta would exceed MAX_INT
   */
  private static function punycodeEncode($input)
  {
      $codePoints = self::utf8Decode($input);
      $total = \count($codePoints);

      // Copy the basic code points verbatim; each one is a single byte.
      $encoded = '';
      foreach ($codePoints as $codePoint) {
          if ($codePoint < 0x80) {
              $encoded .= \chr($codePoint);
          }
      }

      $basicCount = \strlen($encoded);
      $handled = $basicCount;

      if ($basicCount > 0) {
          $encoded .= self::DELIMITER;
      }

      $n = self::INITIAL_N;
      $delta = 0;
      $bias = self::INITIAL_BIAS;

      while ($handled < $total) {
          // Find the smallest code point that has not been handled yet.
          $next = self::MAX_INT;
          foreach ($codePoints as $codePoint) {
              if ($codePoint >= $n && $codePoint < $next) {
                  $next = $codePoint;
              }
          }

          $step = $next - $n;

          if ($step > intdiv(self::MAX_INT - $delta, $handled + 1)) {
              throw new Exception('Integer overflow');
          }

          $delta += $step * ($handled + 1);
          $n = $next;

          foreach ($codePoints as $codePoint) {
              if ($codePoint < $n) {
                  if (0 === ++$delta) {
                      throw new Exception('Integer overflow');
                  }

                  continue;
              }

              if ($codePoint !== $n) {
                  continue;
              }

              // Emit $delta as a generalized variable-length integer.
              $q = $delta;
              $k = self::BASE;

              while (true) {
                  // Threshold for this digit position: $k - $bias clamped to [TMIN, TMAX].
                  $t = max(self::TMIN, min(self::TMAX, $k - $bias));

                  if ($q < $t) {
                      break;
                  }

                  $qMinusT = $q - $t;
                  $baseMinusT = self::BASE - $t;
                  $encoded .= self::encodeDigit($t + $qMinusT % $baseMinusT, false);
                  $q = intdiv($qMinusT, $baseMinusT);
                  $k += self::BASE;
              }

              $encoded .= self::encodeDigit($q, false);
              $bias = self::adaptBias($delta, $handled + 1, $handled === $basicCount);
              $delta = 0;
              ++$handled;
          }

          ++$delta;
          ++$n;
      }

      return $encoded;
  }
  /**
   * Computes the next bias value after a delta has been encoded or decoded.
   *
   * @see https://tools.ietf.org/html/rfc3492#section-6.1
   *
   * @param int  $delta     The delta that was just processed
   * @param int  $numPoints Number of code points handled so far, including this one
   * @param bool $firstTime Whether this is the first adaptation (delta is divided by DAMP
   *                        instead of being halved)
   *
   * @return int
   */
  private static function adaptBias($delta, $numPoints, $firstTime)
  {
      if ($firstTime) {
          $delta = intdiv($delta, self::DAMP);
      } else {
          // A right shift by one is a faster way of doing intdiv($delta, 2).
          $delta >>= 1;
      }

      $delta += intdiv($delta, $numPoints);

      $range = self::BASE - self::TMIN;
      $limit = ($range * self::TMAX) >> 1;

      for ($k = 0; $delta > $limit; $k += self::BASE) {
          $delta = intdiv($delta, $range);
      }

      return $k + intdiv(($range + 1) * $delta, $delta + self::SKEW);
  }
  /**
   * Returns the basic code point for a digit value: 0..25 map to "a".."z" and 26..35 map to
   * "0".."9". With $flag set, 32 is subtracted from the character code, which turns the
   * letters into "A".."Z".
   *
   * @param int  $d    Digit value
   * @param bool $flag Whether to subtract 32 from the resulting character code
   *
   * @return string A single-byte string
   */
  private static function encodeDigit($d, $flag)
  {
      $code = $d + 22;

      if ($d < 26) {
          $code += 75;
      }

      if ($flag) {
          $code -= 32;
      }

      return \chr($code);
  }
  /**
   * Decodes a UTF-8 string into a list of integer code points. Every invalid byte sequence,
   * including a sequence cut short by the end of the string, yields one U+FFFD replacement
   * code point.
   *
   * @see https://encoding.spec.whatwg.org/#utf-8-decoder
   *
   * @param string $input
   *
   * @return array<int, int>
   */
  private static function utf8Decode($input)
  {
      $decoded = array();
      $pending = 0;   // continuation bytes expected for the current sequence
      $consumed = 0;  // continuation bytes read so far for the current sequence
      $min = 0x80;    // allowed range for the next continuation byte
      $max = 0xBF;
      $value = 0;
      $total = \strlen($input);
      $pos = 0;

      while ($pos < $total) {
          $octet = \ord($input[$pos]);

          if (0 === $pending) {
              // Lead byte (or plain ASCII): it is always consumed.
              ++$pos;

              if ($octet <= 0x7F) {
                  $decoded[] = $octet;
              } elseif ($octet >= 0xC2 && $octet <= 0xDF) {
                  $pending = 1;
                  $value = $octet & 0x1F;
              } elseif ($octet >= 0xE0 && $octet <= 0xEF) {
                  // 0xE0 and 0xED narrow the range of the byte that follows.
                  if (0xE0 === $octet) {
                      $min = 0xA0;
                  } elseif (0xED === $octet) {
                      $max = 0x9F;
                  }

                  $pending = 2;
                  $value = $octet & 0xF;
              } elseif ($octet >= 0xF0 && $octet <= 0xF4) {
                  // 0xF0 and 0xF4 narrow the range of the byte that follows.
                  if (0xF0 === $octet) {
                      $min = 0x90;
                  } elseif (0xF4 === $octet) {
                      $max = 0x8F;
                  }

                  $pending = 3;
                  $value = $octet & 0x7;
              } else {
                  $decoded[] = 0xFFFD;
              }

              continue;
          }

          $isContinuation = $octet >= $min && $octet <= $max;

          // The narrowed range only ever applies to the first continuation byte.
          $min = 0x80;
          $max = 0xBF;

          if (!$isContinuation) {
              // Abandon the sequence and re-read this byte as the start of a new one, so the
              // position is intentionally not advanced.
              $value = 0;
              $pending = 0;
              $consumed = 0;
              $decoded[] = 0xFFFD;

              continue;
          }

          ++$pos;
          $value = ($value << 6) | ($octet & 0x3F);

          if (++$consumed === $pending) {
              $decoded[] = $value;
              $value = 0;
              $pending = 0;
              $consumed = 0;
          }
      }

      // String unexpectedly ended, so append a U+FFFD code point.
      if (0 !== $pending) {
          $decoded[] = 0xFFFD;
      }

      return $decoded;
  }
  /**
   * Looks up the status of a code point in the mapping tables. The tables are loaded from
   * Resources/unidata on the first call. The lookup order is: mapped, ignored, deviation,
   * disallowed, then the two STD3-dependent tables; anything not listed is "valid".
   *
   * @param int  $codePoint
   * @param bool $useSTD3ASCIIRules When true, entries of the STD3 tables are "disallowed";
   *                                otherwise they are "mapped" or "valid"
   *
   * @return array{status: string, mapping?: string}
   */
  private static function lookupCodePointStatus($codePoint, $useSTD3ASCIIRules)
  {
      if (!self::$mappingTableLoaded) {
          self::$mappingTableLoaded = true;
          self::$mapped = require __DIR__.'/Resources/unidata/mapped.php';
          self::$ignored = require __DIR__.'/Resources/unidata/ignored.php';
          self::$deviation = require __DIR__.'/Resources/unidata/deviation.php';
          self::$disallowed = require __DIR__.'/Resources/unidata/disallowed.php';
          self::$disallowed_STD3_mapped = require __DIR__.'/Resources/unidata/disallowed_STD3_mapped.php';
          self::$disallowed_STD3_valid = require __DIR__.'/Resources/unidata/disallowed_STD3_valid.php';
      }

      if (isset(self::$mapped[$codePoint])) {
          return array('status' => 'mapped', 'mapping' => self::$mapped[$codePoint]);
      }

      if (isset(self::$ignored[$codePoint])) {
          return array('status' => 'ignored');
      }

      if (isset(self::$deviation[$codePoint])) {
          return array('status' => 'deviation', 'mapping' => self::$deviation[$codePoint]);
      }

      if (isset(self::$disallowed[$codePoint]) || DisallowedRanges::inRange($codePoint)) {
          return array('status' => 'disallowed');
      }

      $hasStd3Mapping = isset(self::$disallowed_STD3_mapped[$codePoint]);

      if (!$hasStd3Mapping && !isset(self::$disallowed_STD3_valid[$codePoint])) {
          return array('status' => 'valid');
      }

      if ($useSTD3ASCIIRules) {
          $status = 'disallowed';
      } else {
          $status = $hasStd3Mapping ? 'mapped' : 'valid';
      }

      $result = array('status' => $status);

      // The mapping is reported even when STD3 rules turn the status into "disallowed".
      if ($hasStd3Mapping) {
          $result['mapping'] = self::$disallowed_STD3_mapped[$codePoint];
      }

      return $result;
  }
  750. }