#!/usr/bin/env php
<?php

/*
 * Generates rare_cp_bitvec.h (written next to this script) from a list of
 * "common" codepoint ranges.
 *
 * Usage: php gen_rare_cp_bitvec.php ./common_codepoints.txt
 *
 * Input format, as parsed below:
 *   - everything from the first '#' on a line is a comment and is ignored;
 *   - blank lines are ignored;
 *   - every other line holds two hexadecimal codepoints separated by a tab:
 *     the inclusive start and end of a range of common codepoints.
 *
 * The output is a C array of 32-bit words holding one bit per codepoint from
 * U+0000 to U+FFFF. Every bit starts out as 1 ('rare') and is cleared for each
 * codepoint covered by an input range.
 *
 * Exits with status 1 (message on stderr) on a usage error, an unreadable
 * input file, a malformed line, or a failed write.
 */

if ($argc < 2) {
    fwrite(STDERR, "Usage: php gen_rare_cp_bitvec.php ./common_codepoints.txt\n");
    exit(1);
}

// Highest codepoint represented in the table, and the number of 32-bit words
// needed to hold one bit for each of U+0000..U+FFFF. Computed with integer
// arithmetic only: a float count passed to array_fill() is an implicit lossy
// float-to-int conversion, which is deprecated as of PHP 8.1.
$maxCodepoint = 0xFFFF;
$wordCount = ($maxCodepoint >> 5) + 1;

// Start with every codepoint marked rare (all bits set).
// NOTE: 0xFFFFFFFF only fits in an integer on 64-bit builds of PHP.
$bitvec = array_fill(0, $wordCount, 0xFFFFFFFF);

$input = file_get_contents($argv[1]);
if ($input === false) {
    fwrite(STDERR, "Error: unable to read input file '{$argv[1]}'\n");
    exit(1);
}

foreach (explode("\n", $input) as $index => $line) {
    // Strip a trailing comment, if any.
    $hashPos = strpos($line, '#');
    if ($hashPos !== false) {
        $line = substr($line, 0, $hashPos);
    }

    $line = trim($line);
    if ($line === '') {
        continue;
    }

    $range = explode("\t", $line);
    if (count($range) < 2 || trim($range[1]) === '') {
        // Fail loudly instead of silently dropping the range.
        fwrite(STDERR, sprintf(
            "Error: %s:%d: expected two tab-separated hex codepoints, got '%s'\n",
            $argv[1],
            $index + 1,
            $line
        ));
        exit(1);
    }

    // The table only covers U+0000..U+FFFF, so clamp both ends to it. Indexing
    // past the table would create stray entries that are never emitted.
    // Clamping happens before the int cast because hexdec() returns a float
    // for values that do not fit in an integer.
    $start = (int) min(hexdec($range[0]), $maxCodepoint + 1);
    $end = (int) min(hexdec($range[1]), $maxCodepoint);

    for ($cp = $start; $cp <= $end; $cp++) {
        // Word index is cp / 32, bit index is cp % 32; clear the bit to mark
        // the codepoint as common.
        $bitvec[$cp >> 5] &= ~(1 << ($cp & 0x1F));
    }
}

$result = <<<'HEADER'
/* Machine-generated file; do not edit! See gen_rare_cp_bitvec.php.
 *
 * The below array has one bit for each Unicode codepoint from U+0000 to U+FFFF.
 * The bit is 1 if the codepoint is considered 'rare' for the purpose of
 * guessing the text encoding of a string.
 *
 * Each 'rare' codepoint which appears in a string when it is interpreted
 * using a candidate encoding causes the candidate encoding to be treated
 * as less likely to be the correct one.
 */

static uint32_t rare_codepoint_bitvec[] = {
HEADER;

// Emit eight words per line, each as a zero-padded 8-digit hex literal.
for ($i = 0; $i < $wordCount; $i++) {
    $result .= ($i % 8 === 0) ? "\n" : " ";
    $result .= sprintf("0x%08x,", $bitvec[$i]);
}

$result .= "\n};\n";

$outputPath = __DIR__ . '/rare_cp_bitvec.h';
if (file_put_contents($outputPath, $result) === false) {
    fwrite(STDERR, "Error: unable to write '{$outputPath}'\n");
    exit(1);
}

echo "Done.\n";