<?php

/**
 * Generates the address format and subdivision definitions from the raw
 * Google definitions stored under assets/google/.
 *
 * Output:
 *  - subdivision/<group>.json  One JSON file per subdivision group.
 *  - address_formats.php       All address formats, as a PHP array.
 */

// NOTE(review): everything that preceded "getList();" was lost from this copy
// of the script (opening tag, includes, `use` imports and the object whose
// getList() method is called). The two statements below are a reconstruction
// and must be confirmed against version control. The imports for AddressField,
// AdministrativeAreaType, DependentLocalityType, PostalCodeType and Locale, as
// well as the include that provides get_address_format_customizations() and
// get_subdivision_customizations(), still need to be restored above this line.
$countryRepository = new CountryRepository();
$countries = $countryRepository->getList();
ksort($countries);
$serviceUrl = 'https://chromium-i18n.appspot.com/ssl-address';

// Make sure we're starting from a clean slate.
if (is_dir(__DIR__ . '/subdivision')) {
    die('The subdivision/ directory must not exist.');
}

// Prepare the filesystem.
mkdir(__DIR__ . '/subdivision');

// Create a list of countries for which Google has definitions.
$foundCountries = ['ZZ'];
$index = file_get_contents($serviceUrl);
if ($index === false) {
    die('Could not download the country index from ' . $serviceUrl . '.');
}
foreach ($countries as $countryCode => $countryName) {
    // NOTE(review): the original literal was stripped down to "" in this copy.
    // The anchor markup below is a reconstruction of what the index page is
    // expected to contain for each available country; confirm before running.
    $link = "<a href='/ssl-address/data/{$countryCode}'>";
    // This is still faster than running a file_exists() for each country code.
    if (strpos($index, $link) !== false) {
        $foundCountries[] = $countryCode;
    }
}

echo "Converting the raw definitions into the expected format.\n";
// NOTE(review): $genericDefinition is not read anywhere in the visible code.
// It is kept in case an included file relies on it.
$genericDefinition = null;
$addressFormats = [];
$groupedSubdivisions = [];
foreach ($foundCountries as $countryCode) {
    $definition = read_json_definition(__DIR__ . '/assets/google/' . $countryCode . '.json');
    $extraKeys = array_diff(array_keys($definition), ['id', 'key', 'name']);
    if (empty($extraKeys)) {
        // This is an empty definition, skip it.
        continue;
    }
    if ($countryCode === 'MO') {
        // Fix for Macao, which has latin and non-latin formats, but no lang.
        $definition['lang'] = 'zh';
    }

    $addressFormat = create_address_format_definition($countryCode, $definition);

    // Get the French subdivision names for Canada.
    // This mechanism can only work for countries with a single
    // alternative language and ISO-based subdivision codes
    // (URL example: data/CA/AB and data/CA/AB--fr).
    $languages = [];
    if ($countryCode === 'CA' && isset($definition['languages'])) {
        $languages = explode('~', $definition['languages']);
        // The first language is the default one, only the alternatives are needed.
        array_shift($languages);
    }

    $subdivisionPaths = [];
    if (isset($definition['sub_keys'])) {
        $subdivisionKeys = explode('~', $definition['sub_keys']);
        foreach ($subdivisionKeys as $subdivisionKey) {
            $subdivisionPaths[] = $countryCode . '_' . $subdivisionKey;
        }
    }

    $groupedSubdivisions += generate_subdivisions($countryCode, [$countryCode], $subdivisionPaths, $languages);
    $addressFormats[$countryCode] = $addressFormat;
}

echo "Writing the final definitions to disk.\n";
// Subdivisions are stored in JSON.
foreach ($groupedSubdivisions as $parentId => $subdivisions) {
    file_put_json(__DIR__ . '/subdivision/' . $parentId . '.json', $subdivisions);
}
// Replace subdivision/ES.json with the old resources/subdivision/ES.json, to
// get around a dataset regression (https://github.com/googlei18n/libaddressinput/issues/160).
copy(__DIR__ . '/../resources/subdivision/ES.json', __DIR__ . '/subdivision/ES.json');

// Generate the subdivision depths for each country.
$depths = generate_subdivision_depths($foundCountries);
foreach ($depths as $countryCode => $depth) {
    $addressFormats[$countryCode]['subdivision_depth'] = $depth;
}
// Address formats are stored in PHP, then manually transferred to
// AddressFormatRepository.
file_put_php(__DIR__ . '/address_formats.php', $addressFormats);

echo "Done.\n";

/**
 * Reads a raw JSON definition from disk and decodes it into an array.
 *
 * Fails loudly instead of letting a missing or corrupt file silently turn
 * into an empty definition further down the pipeline.
 *
 * @param string $filename The full path of the JSON file.
 *
 * @return array The decoded definition.
 *
 * @throws \RuntimeException If the file can't be read or doesn't decode to an array.
 */
function read_json_definition($filename)
{
    $contents = file_get_contents($filename);
    if ($contents === false) {
        throw new \RuntimeException(sprintf('Could not read "%s".', $filename));
    }
    $definition = json_decode($contents, true);
    if (!is_array($definition)) {
        throw new \RuntimeException(sprintf('"%s" does not contain a valid JSON definition.', $filename));
    }

    return $definition;
}

/**
 * Converts the provided data into json and writes it to the disk.
 *
 * @param string $filename The destination path.
 * @param mixed  $data     The data to encode.
 *
 * @throws \RuntimeException If the data can't be encoded.
 */
function file_put_json($filename, $data)
{
    $data = json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE);
    if ($data === false) {
        throw new \RuntimeException(sprintf('Could not encode the data for "%s": %s', $filename, json_last_error_msg()));
    }
    // Indenting with tabs instead of 4 spaces gives us 20% smaller files.
    // JSON_PRETTY_PRINT indents with 4 spaces per level; the search string
    // must be exactly 4 spaces, otherwise every single space (including the
    // ones inside values) would become a tab.
    $data = str_replace('    ', "\t", $data);
    file_put_contents($filename, $data);
}

/**
 * Converts the provided data into php and writes it to the disk.
 *
 * @param string $filename The destination path.
 * @param array  $data     The data to export.
 */
function file_put_php($filename, $data)
{
    $data = var_export($data, true) . ';';
    // The var_export output is terrible, so try to get it as close as possible
    // to the final result.
    // NOTE(review): the runs of spaces inside the literals below were collapsed
    // in this copy of the script. They were rebuilt from var_export()'s
    // two-spaces-per-level indentation; confirm against version control.
    $data = str_replace(['array (', "),\n", ');', "=> \n  "], ['[', "],\n", '];', '=> '], $data);
    // Deeper arrays are left with extra indentation between "=>" and "[".
    $data = str_replace('=>   [', '=> [', $data);
    // Strip the numeric list keys. Anchoring at the start of the line handles
    // keys of any length; a plain replacement of "0 => " would also eat the
    // tail of "10 => " and leave a stray "1" behind.
    $data = preg_replace('/^( *)\d+ => /m', '$1', $data);
    // Put fields into one row.
    $find = [];
    $replace = [];
    foreach (AddressField::getAll() as $field) {
        $find[] = "'$field',\n      '";
        $replace[] = "'$field', '";
    }
    $data = str_replace($find, $replace, $data);
    // Replace format single quotes with double quotes, to parse \n properly.
    // The ";;;" marker is appended by create_address_format_definition().
    $data = str_replace(["format' => '", ";;;'"], ['format\' => "', '"'], $data);
    // Reindent (from 2 to 4 spaces).
    $data = str_replace('  ', '    ', $data);
    // Unescape backslashes.
    $data = str_replace('\\\\', '\\', $data);
    // NOTE(review): the end of this function was lost in this copy (only
    // "$data = '" survived). The two statements below are a reconstruction
    // based on the docblock and on file_put_json(); confirm before relying on
    // the exact header that gets written.
    $data = '<?php' . "\n\n" . '$data = ' . $data;
    file_put_contents($filename, $data);
}

/**
 * Recursively generates the subdivision definitions of the given parents.
 *
 * NOTE(review): the docblock, signature and first statements of this function
 * were lost in this copy. They were rebuilt from the two call sites and from
 * the variables the surviving body reads ($group, $parents, $subdivisionPaths,
 * $languages). The 'country_code' key name in particular is a reconstruction.
 *
 * @param string   $countryCode      The country code.
 * @param string[] $parents          The parents, starting with the country code.
 * @param string[] $subdivisionPaths The raw definition paths, relative to
 *                                   assets/google/ and without the extension.
 * @param string[] $languages        The translation languages. Only the first
 *                                   one is used.
 *
 * @return array The generated groups, keyed by group name. Empty if the
 *               requested group ended up without subdivisions.
 */
function generate_subdivisions($countryCode, array $parents, $subdivisionPaths, $languages)
{
    $group = build_group($parents);
    $subdivisions = [];
    $subdivisions[$group] = [
        'country_code' => $countryCode,
    ];
    if (count($parents) > 1) {
        // A single parent is the same as the country code, hence unnecessary.
        $subdivisions[$group]['parents'] = $parents;
    }
    // (Ab)use the local_name field to hold latin translations. This allows
    // us to support only a single translation, but since our only example
    // here is Canada (with French), it will do.
    $translationLanguage = reset($languages);

    foreach ($subdivisionPaths as $subdivisionPath) {
        $definition = read_json_definition(__DIR__ . '/assets/google/' . $subdivisionPath . '.json');
        // The lname is usable as a latin code when the key is non-latin.
        $code = $definition['key'];
        if (isset($definition['lname'])) {
            $code = $definition['lname'];
        }
        if (empty($subdivisions[$group]['locale']) && isset($definition['lang'], $definition['lname'])) {
            // Only add the locale if there's a local name.
            $subdivisions[$group]['locale'] = process_locale($definition['lang']);
        }
        if ($translationLanguage) {
            $translation = read_json_definition(__DIR__ . '/assets/google/' . $subdivisionPath . '--' . $translationLanguage . '.json');
            $subdivisions[$group]['locale'] = Locale::canonicalize($translationLanguage);
            // The default name becomes the latin one, the translation the local one.
            $definition['lname'] = $definition['name'];
            $definition['name'] = $translation['name'];
        }
        // Remove the locale key if it wasn't filled.
        if (empty($subdivisions[$group]['locale'])) {
            unset($subdivisions[$group]['locale']);
        }

        // Generate the subdivision.
        $subdivisions[$group]['subdivisions'][$code] = create_subdivision_definition($countryCode, $code, $definition);
        if (isset($definition['sub_keys'])) {
            $subdivisions[$group]['subdivisions'][$code]['has_children'] = true;

            $subdivisionChildrenPaths = [];
            $subdivisionChildrenKeys = explode('~', $definition['sub_keys']);
            foreach ($subdivisionChildrenKeys as $subdivisionChildrenKey) {
                $subdivisionChildrenPaths[] = $subdivisionPath . '_' . $subdivisionChildrenKey;
            }

            $childParents = array_merge($parents, [$code]);
            $subdivisions += generate_subdivisions($countryCode, $childParents, $subdivisionChildrenPaths, $languages);
        }
    }

    // Apply any found customizations.
    $customizations = get_subdivision_customizations($group);
    $subdivisions[$group] = apply_subdivision_customizations($subdivisions[$group], $customizations);

    return !empty($subdivisions[$group]['subdivisions']) ? $subdivisions : [];
}

/**
 * Generates the subdivision depths for each country.
 *
 * The depth is derived from the files written to subdivision/: one level for
 * "<country>.json", one more for "<country>-*.json", one more for
 * "<country>--*.json". Counting stops at the first level without files.
 *
 * @param string[] $countries The country codes.
 *
 * @return int[] The depths, keyed by country code. Countries without
 *               subdivision files are omitted.
 */
function generate_subdivision_depths($countries)
{
    $depths = [];
    foreach ($countries as $countryCode) {
        $prefix = __DIR__ . '/subdivision/' . $countryCode;
        $patterns = [
            $prefix . '.json',
            $prefix . '-*.json',
            $prefix . '--*.json',
        ];
        foreach ($patterns as $pattern) {
            if (!glob($pattern)) {
                break;
            }
            $previous = isset($depths[$countryCode]) ? $depths[$countryCode] : 0;
            $depths[$countryCode] = $previous + 1;
        }
    }

    return $depths;
}

/**
 * Creates an address format definition from Google's raw definition.
 *
 * @param string $countryCode   The country code.
 * @param array  $rawDefinition The raw definition.
 *
 * @return array The address format definition, without NULL values. The
 *               format strings end with ";;;", a marker used by file_put_php().
 */
function create_address_format_definition($countryCode, $rawDefinition)
{
    // Avoid notices.
    $rawDefinition += [
        'lang' => null,
        'fmt' => null,
        'require' => null,
        'upper' => null,
        'state_name_type' => null,
        'locality_name_type' => null,
        'sublocality_name_type' => null,
        'zip_name_type' => null,
    ];
    // ZZ holds the defaults for all address formats, and these are missing.
    if ($countryCode === 'ZZ') {
        $rawDefinition['state_name_type'] = AdministrativeAreaType::getDefault();
        $rawDefinition['sublocality_name_type'] = DependentLocalityType::getDefault();
        $rawDefinition['zip_name_type'] = PostalCodeType::getDefault();
    }

    $addressFormat = [
        'locale' => process_locale($rawDefinition['lang']),
        'format' => null,
        'local_format' => null,
        'required_fields' => convert_fields($rawDefinition['require'], 'required'),
        'uppercase_fields' => convert_fields($rawDefinition['upper'], 'uppercase'),
    ];

    if (isset($rawDefinition['lfmt']) && $rawDefinition['lfmt'] !== $rawDefinition['fmt']) {
        // The latin format is the main one, the other format is the local one.
        $addressFormat['format'] = convert_format($countryCode, $rawDefinition['lfmt']);
        $addressFormat['local_format'] = convert_format($countryCode, $rawDefinition['fmt']);
    } else {
        $addressFormat['format'] = convert_format($countryCode, $rawDefinition['fmt']);
        // We don't need the locale if there's no local format.
        unset($addressFormat['locale']);
    }

    $addressFormat['administrative_area_type'] = $rawDefinition['state_name_type'];
    $addressFormat['locality_type'] = $rawDefinition['locality_name_type'];
    $addressFormat['dependent_locality_type'] = $rawDefinition['sublocality_name_type'];
    $addressFormat['postal_code_type'] = $rawDefinition['zip_name_type'];
    if (isset($rawDefinition['zip'])) {
        $addressFormat['postal_code_pattern'] = $rawDefinition['zip'];
    }
    if (isset($rawDefinition['postprefix'])) {
        // Workaround for https://github.com/googlei18n/libaddressinput/issues/72.
        if ($rawDefinition['postprefix'] === 'PR') {
            $rawDefinition['postprefix'] = 'PR ';
        }
        $addressFormat['postal_code_prefix'] = $rawDefinition['postprefix'];
        // Remove the prefix from the format strings.
        // Workaround for https://github.com/googlei18n/libaddressinput/issues/71.
        // Missing formats are left as NULL (and dropped below) instead of being
        // passed to str_replace(), which rejects NULL subjects on newer PHP.
        foreach (['format', 'local_format'] as $formatKey) {
            if ($addressFormat[$formatKey] !== null) {
                $addressFormat[$formatKey] = str_replace($addressFormat['postal_code_prefix'], '', $addressFormat[$formatKey]);
            }
        }
    }
    // Add the subdivision_depth to the end of the ZZ definition.
    if ($countryCode === 'ZZ') {
        $addressFormat['subdivision_depth'] = 0;
    }

    // Remove multiple spaces in the formats.
    foreach (['format', 'local_format'] as $formatKey) {
        if (!empty($addressFormat[$formatKey])) {
            $addressFormat[$formatKey] = preg_replace('/[[:blank:]]+/', ' ', $addressFormat[$formatKey]);
        }
    }

    // Apply any customizations.
    $customizations = get_address_format_customizations($countryCode);
    foreach ($customizations as $key => $values) {
        $addressFormat[$key] = $values;
    }
    // Denote the end of the format string for file_put_php().
    foreach (['format', 'local_format'] as $formatKey) {
        if (!empty($addressFormat[$formatKey])) {
            $addressFormat[$formatKey] .= ';;;';
        }
    }

    // Remove NULL keys.
    $addressFormat = array_filter($addressFormat, function ($value) {
        return !is_null($value);
    });
    // Remove empty local formats.
    if (empty($addressFormat['local_format'])) {
        unset($addressFormat['local_format']);
    }

    return $addressFormat;
}

/**
 * Creates a subdivision definition from Google's raw definition.
 *
 * @param string $countryCode   The country code.
 * @param string $code          The subdivision code (the array key used by the caller).
 * @param array  $rawDefinition The raw definition.
 *
 * @return array The subdivision definition.
 */
function create_subdivision_definition($countryCode, $code, $rawDefinition)
{
    $subdivision = [];
    // Names identical to the key/code are omitted, they add no information.
    // The comparisons are strict so that numeric-looking strings such as
    // '01' and '1' aren't mistaken for each other.
    if (isset($rawDefinition['lname'])) {
        $subdivision['local_code'] = $rawDefinition['key'];
        if (isset($rawDefinition['name']) && $rawDefinition['key'] !== $rawDefinition['name']) {
            $subdivision['local_name'] = $rawDefinition['name'];
        }
        if ((string) $code !== $rawDefinition['lname']) {
            $subdivision['name'] = $rawDefinition['lname'];
        }
    } elseif (isset($rawDefinition['name']) && $rawDefinition['key'] !== $rawDefinition['name']) {
        $subdivision['name'] = $rawDefinition['name'];
    }
    if (isset($rawDefinition['isoid'])) {
        $subdivision['iso_code'] = $countryCode . '-' . $rawDefinition['isoid'];
    }
    if (isset($rawDefinition['xzip'])) {
        $subdivision['postal_code_pattern'] = $rawDefinition['xzip'];
        $subdivision['postal_code_pattern_type'] = 'full';
    } elseif (isset($rawDefinition['zip'])) {
        $subdivision['postal_code_pattern'] = $rawDefinition['zip'];
        // There are more than 12 000 subdivisions, but only a few Chinese
        // ones specify a full pattern. Therefore, the postal_code_pattern_type
        // value is the same for most subdivisions, and omitted to save space.
    }

    return $subdivision;
}

/**
 * Applies subdivision customizations.
 *
 * Recognized customization keys:
 *  - _remove:    list of subdivision ids to remove.
 *  - _replace:   list of subdivision ids whose definition is replaced.
 *  - _add:       list of subdivision ids to append.
 *  - _add_after: map of new subdivision id => id of the existing subdivision
 *                that must directly follow the new one.
 * The definitions themselves are stored in $customizations under their id.
 *
 * @param array $subdivisions   The group, with a 'subdivisions' key.
 * @param array $customizations The customizations for the group.
 *
 * @return array The customized group.
 *
 * @throws \InvalidArgumentException If an _add_after entry references an
 *                                   unknown subdivision.
 */
function apply_subdivision_customizations($subdivisions, $customizations)
{
    if (empty($customizations)) {
        return $subdivisions;
    }

    $customizations += [
        '_remove' => [],
        '_replace' => [],
        '_add' => [],
        '_add_after' => [],
    ];

    foreach ($customizations['_remove'] as $removeId) {
        unset($subdivisions['subdivisions'][$removeId]);
    }
    foreach ($customizations['_replace'] as $replaceId) {
        $subdivisions['subdivisions'][$replaceId] = $customizations[$replaceId];
    }
    foreach ($customizations['_add'] as $addId) {
        $subdivisions['subdivisions'][$addId] = $customizations[$addId];
    }
    foreach ($customizations['_add_after'] as $addId => $nextId) {
        $position = array_search($nextId, array_keys($subdivisions['subdivisions']));
        if ($position === false) {
            // Without this check the new subdivision would silently end up
            // at the very start of the list.
            throw new \InvalidArgumentException(sprintf('Can\'t add "%s": the subdivision "%s" does not exist.', $addId, $nextId));
        }
        $new = [
            $addId => $customizations[$addId],
        ];
        // array_splice() doesn't support non-numeric replacement keys.
        $start = array_slice($subdivisions['subdivisions'], 0, $position);
        $end = array_slice($subdivisions['subdivisions'], $position);
        $subdivisions['subdivisions'] = $start + $new + $end;
    }

    return $subdivisions;
}

/**
 * Processes the locale string.
 *
 * @param string|null $locale The raw locale.
 *
 * @return mixed The result of Locale::canonicalize().
 */
function process_locale($locale)
{
    // Be more precise when it comes to Chinese Simplified.
    if ($locale === 'zh') {
        $locale = 'zh-hans';
    }

    return Locale::canonicalize($locale);
}

/**
 * Converts the provided format string into one recognized by the library.
 *
 * @param string      $countryCode The country code.
 * @param string|null $format      Google's format string.
 *
 * @return string|null The converted format, or NULL if no format was given.
 */
function convert_format($countryCode, $format)
{
    if (empty($format)) {
        return null;
    }
    // Expand the recipient token into separate familyName/givenName tokens.
    // The additionalName field is not used by default.
    // Hardcode the list of countries that write the family name before the
    // given name, since the API doesn't give us that info.
    // South Korea is listed under its country code (KR). The previous 'KO'
    // entry is the language code and never matched any country.
    $reverseCountries = [
        'KH', 'CN', 'HU', 'JP', 'KR', 'MG', 'TW', 'VN',
    ];
    if (in_array($countryCode, $reverseCountries, true)) {
        $format = str_replace('%N', '%N3 %N1', $format);
    } else {
        $format = str_replace('%N', '%N1 %N3', $format);
    }
    // Expand the address token into separate tokens for address lines 1 and 2.
    $format = str_replace('%A', '%1%n%2', $format);

    // strtr() always tries the longest key first, so the hardcoded strings
    // below win over the shorter '%n' token they contain.
    $replacements = [
        '%S' => '%' . AddressField::ADMINISTRATIVE_AREA,
        '%C' => '%' . AddressField::LOCALITY,
        '%D' => '%' . AddressField::DEPENDENT_LOCALITY,
        '%Z' => '%' . AddressField::POSTAL_CODE,
        '%X' => '%' . AddressField::SORTING_CODE,
        '%1' => '%' . AddressField::ADDRESS_LINE1,
        '%2' => '%' . AddressField::ADDRESS_LINE2,
        '%O' => '%' . AddressField::ORGANIZATION,
        '%N3' => '%' . AddressField::FAMILY_NAME,
        '%N2' => '%' . AddressField::ADDITIONAL_NAME,
        '%N1' => '%' . AddressField::GIVEN_NAME,
        // A literal backslash + n on purpose: file_put_php() writes the
        // format inside double quotes, where it becomes a real newline.
        '%n' => '\n',
        // Remove hardcoded strings which duplicate the country name.
        '%nÅLAND' => '',
        'JERSEY%n' => '',
        'GUERNSEY%n' => '',
        'GIBRALTAR%n' => '',
        'SINGAPORE ' => '',
    ];
    $format = strtr($format, $replacements);

    return $format;
}

/**
 * Converts google's field symbols to the expected values.
 *
 * @param string|null $fields The field symbols, one character per field.
 * @param string      $type   'required' or 'uppercase'.
 *
 * @return array|null The field names, an empty array for an empty string,
 *                    or NULL if no symbols were given.
 */
function convert_fields($fields, $type)
{
    if (is_null($fields)) {
        return null;
    }
    if (empty($fields)) {
        return [];
    }

    // The digits are temporary single-character placeholders, so that the
    // string can be split into one character per field below.
    $isRequired = ($type === 'required');
    // Expand the name token into separate tokens.
    // The additional name is never required.
    $fields = str_replace('N', $isRequired ? '79' : '789', $fields);
    // Expand the address token into separate tokens for address lines 1 and 2.
    // For required fields it's enough to require the first line.
    $fields = str_replace('A', $isRequired ? '1' : '12', $fields);

    $mapping = [
        'S' => AddressField::ADMINISTRATIVE_AREA,
        'C' => AddressField::LOCALITY,
        'D' => AddressField::DEPENDENT_LOCALITY,
        'Z' => AddressField::POSTAL_CODE,
        'X' => AddressField::SORTING_CODE,
        '1' => AddressField::ADDRESS_LINE1,
        '2' => AddressField::ADDRESS_LINE2,
        'O' => AddressField::ORGANIZATION,
        '7' => AddressField::FAMILY_NAME,
        '8' => AddressField::ADDITIONAL_NAME,
        '9' => AddressField::GIVEN_NAME,
    ];

    $fields = str_split($fields);
    foreach ($fields as $key => $field) {
        // Unknown symbols are passed through unchanged.
        if (isset($mapping[$field])) {
            $fields[$key] = $mapping[$field];
        }
    }

    return $fields;
}

/**
 * Copy of SubdivisionRepository::buildGroup().
 *
 * Builds the group name: the country code, followed (when there are further
 * parents) by one dash per remaining parent and a hash of those parents.
 *
 * @param string[] $parents The parents, starting with the country code.
 *
 * @return string The group name.
 *
 * @throws \InvalidArgumentException If $parents is empty.
 */
function build_group(array $parents)
{
    if (empty($parents)) {
        throw new \InvalidArgumentException('The $parents argument must not be empty.');
    }

    $countryCode = array_shift($parents);
    $group = $countryCode;
    if ($parents) {
        // A dash per key allows the depth to be guessed later.
        $group .= str_repeat('-', count($parents));
        // Hash the remaining keys to ensure that the group is ASCII safe.
        // crc32b is the fastest but has collisions due to its short length.
        // sha1 and md5 are forbidden by many projects and organizations.
        // This is the next fastest option.
        $group .= hash('tiger128,3', implode('-', $parents));
    }

    return $group;
}