From 203650a6dbd6efd790d8a2989136ebc87bdf945e Mon Sep 17 00:00:00 2001 From: Chuck Hagenbuch Date: Mon, 3 Nov 2008 14:28:58 -0500 Subject: [PATCH] add initial numerizer implementation and tests --- .../lib/Horde/Support/Numerizer.php | 28 ++++++ .../lib/Horde/Support/Numerizer/Locale/Base.php | 112 +++++++++++++++++++++ .../test/Horde/Support/AllTests.php | 54 ++++++++++ .../test/Horde/Support/NumerizerTest.php | 62 ++++++++++++ 4 files changed, 256 insertions(+) create mode 100644 framework/Horde_Date_Parser/lib/Horde/Support/Numerizer.php create mode 100644 framework/Horde_Date_Parser/lib/Horde/Support/Numerizer/Locale/Base.php create mode 100644 framework/Horde_Date_Parser/test/Horde/Support/AllTests.php create mode 100644 framework/Horde_Date_Parser/test/Horde/Support/NumerizerTest.php diff --git a/framework/Horde_Date_Parser/lib/Horde/Support/Numerizer.php b/framework/Horde_Date_Parser/lib/Horde/Support/Numerizer.php new file mode 100644 index 000000000..3bc0d98af --- /dev/null +++ b/framework/Horde_Date_Parser/lib/Horde/Support/Numerizer.php @@ -0,0 +1,28 @@ + '11', + 'twelve' => '12', + 'thirteen' => '13', + 'fourteen' => '14', + 'fifteen' => '15', + 'sixteen' => '16', + 'seventeen' => '17', + 'eighteen' => '18', + 'nineteen' => '19', + 'ninteen' => '19', // Common mis-spelling + 'zero' => '0', + 'one' => '1', + 'two' => '2', + 'three' => '3', + 'four(\W|$)' => '4$1', // The weird regex is so that it matches four but not fourty + 'five' => '5', + 'six(\W|$)' => '6$1', + 'seven(\W|$)' => '7$1', + 'eight(\W|$)' => '8$1', + 'nine(\W|$)' => '9$1', + 'ten' => '10', + '\ba[\b^$]' => '1', // doesn't make sense for an 'a' at the end to be a 1 + ); + + public $TEN_PREFIXES = array( + 'twenty' => 20, + 'thirty' => 30, + 'fourty' => 40, + 'fifty' => 50, + 'sixty' => 60, + 'seventy' => 70, + 'eighty' => 80, + 'ninety' => 90, + ); + + public $BIG_PREFIXES = array( + 'hundred' => 100, + 'thousand' => 1000, + 'million' => 1000000, + 'billion' => 1000000000, + 'trillion' => 1000000000000, + ); + + public function numerize($string) + { + // preprocess + // will mutilate hyphenated-words but shouldn't matter for date extraction + $string = preg_replace('/ +|([^\d])-([^d])/', '$1 $2', $string); + // take the 'a' out so it doesn't turn into a 1, save the half for the end + $string = str_replace('a half', 'haAlf', $string); + + // easy/direct replacements + foreach ($this->DIRECT_NUMS as $dn => $dn_replacement) { + $string = preg_replace("/$dn/i", $dn_replacement, $string); + } + + // ten, twenty, etc. + foreach ($this->TEN_PREFIXES as $tp => $tp_replacement) { + $string = preg_replace_callback( + "/(?:$tp)( *\d(?=[^\d]|\$))*/i", + create_function( + '$m', + 'return ' . $tp_replacement . ' + (isset($m[1]) ? (int)$m[1] : 0);' + ), + $string); + } + + // hundreds, thousands, millions, etc. + foreach ($this->BIG_PREFIXES as $bp => $bp_replacement) { + $string = preg_replace_callback( + '/(\d*) *' . $bp . '/i', + create_function( + '$m', + 'return ' . $bp_replacement . ' * (int)$m[1];' + ), + $string); + $string = $this->_andition($string); + } + + // fractional addition + // I'm not combining this with the previous block as using float addition complicates the strings + // (with extraneous .0's and such ) + $string = preg_replace_callback( + '/(\d+)(?: | and |-)*haAlf/i', + create_function( + '$m', + 'return (string)((float)$m[1] + 0.5);' + ), + $string); + + return $string; + } + + protected function _andition($string) + { + while (true) { + if (preg_match('/(\d+)( | and )(\d+)(?=[^\w]|$)/i', $string, $sc, PREG_OFFSET_CAPTURE)) { + if (preg_match('/and/', $sc[2][0]) || $sc[1][0] > $sc[3][0]) { + $string = substr($string, 0, $sc[1][1]) . ((int)$sc[1][0] + (int)$sc[3][0]) . substr($string, $sc[3][1] + strlen($sc[3][0])); + continue; + } + } + break; + } + return $string; + } + +} diff --git a/framework/Horde_Date_Parser/test/Horde/Support/AllTests.php b/framework/Horde_Date_Parser/test/Horde/Support/AllTests.php new file mode 100644 index 000000000..7fac8b189 --- /dev/null +++ b/framework/Horde_Date_Parser/test/Horde/Support/AllTests.php @@ -0,0 +1,54 @@ +isFile() && preg_match('/Test.php$/', $file->getFilename())) { + $pathname = $file->getPathname(); + require $pathname; + + $class = str_replace(DIRECTORY_SEPARATOR, '_', + preg_replace("/^$baseregexp(.*)\.php/", '\\1', $pathname)); + $suite->addTestSuite('Horde_Support_' . $class); + } + } + + return $suite; + } + +} + +if (PHPUnit_MAIN_METHOD == 'Horde_Support_AllTests::main') { + Horde_Support_AllTests::main(); +} diff --git a/framework/Horde_Date_Parser/test/Horde/Support/NumerizerTest.php b/framework/Horde_Date_Parser/test/Horde/Support/NumerizerTest.php new file mode 100644 index 000000000..76c2d4a2b --- /dev/null +++ b/framework/Horde_Date_Parser/test/Horde/Support/NumerizerTest.php @@ -0,0 +1,62 @@ + 'one', + 5 => 'five', + 10 => 'ten', + 11 => 'eleven', + 12 => 'twelve', + 13 => 'thirteen', + 14 => 'fourteen', + 15 => 'fifteen', + 16 => 'sixteen', + 17 => 'seventeen', + 18 => 'eighteen', + 19 => 'nineteen', + 20 => 'twenty', + 27 => 'twenty seven', + 31 => 'thirty-one', + 59 => 'fifty nine', + 100 => 'a hundred', + 100 => 'one hundred', + 150 => 'one hundred and fifty', + // 150 => 'one fifty', + 200 => 'two-hundred', + 500 => '5 hundred', + 999 => 'nine hundred and ninety nine', + 1000 => 'one thousand', + 1200 => 'twelve hundred', + 1200 => 'one thousand two hundred', + 17000 => 'seventeen thousand', + 21473 => 'twentyone-thousand-four-hundred-and-seventy-three', + 74002 => 'seventy four thousand and two', + 99999 => 'ninety nine thousand nine hundred ninety nine', + 100000 => '100 thousand', + 250000 => 'two hundred fifty thousand', + 1000000 => 'one million', + 1250007 => 'one million two hundred fifty thousand and seven', + 1000000000 => 'one billion', + 1000000001 => 'one billion and one', + ); + + foreach ($strings as $key => $string) { + $this->assertEquals($key, (int)$numerizer->numerize($string)); + } + } + +} -- 2.11.0