// Provenance: PHP side of commit f9438309c201e730cb205e709e88e7852d6c5ffe
// ("initial base parser port", Chuck Hagenbuch, 2009-01-26), file
// framework/Date_Parser/lib/Horde/Date/Parser/Locale/Base.php, a port of the
// Ruby Chronic parser.
//
// NOTE(review): the class header was not present in the extracted text; the
// class name below is inferred from the file path -- confirm against the
// repository. componentFactory() and tokensToSpan() are called here but are
// not defined in the visible portion of the file.

/**
 * Locale-independent base for the natural language date parser.
 */
class Horde_Date_Parser_Locale_Base
{
    /**
     * Reference time for the current parse() call (the 'now' option).
     *
     * @var mixed
     */
    public $now;

    /**
     * Locale forwarded to Horde_Support_Numerizer::numerize().
     * NOTE(review): nothing in the visible code assigns this -- confirm
     * where it is expected to be set.
     *
     * @var mixed
     */
    public $locale;

    /**
     * Parses a string containing a natural language date or time.
     *
     * Recognised option keys:
     *  - 'context': 'past', 'future' or 'none' (default 'future').
     *  - 'now': reference time, stored in $this->now. Defaults to a new
     *    Horde_Date; an explicit null also selects that default.
     *  - 'guess': boolean (default true). When true the result of guess()
     *    is returned; when false the whole span is returned.
     *  - 'ambiguousTimeRange': integer or 'none' (default 6, i.e. 6am-6pm);
     *    forwarded to the Repeater scanner, which decides how ambiguous
     *    times such as 5:00 are resolved.
     *
     * @param string $text              Text to parse.
     * @param array  $specifiedOptions  Overrides for the options above.
     *
     * @return mixed  The guess() result when 'guess' is true, otherwise the
     *                value returned by tokensToSpan().
     *
     * @throws InvalidArgumentException  On an unknown option key or an
     *                                   invalid 'context' value.
     */
    public function parse($text, $specifiedOptions = array())
    {
        $options = $this->_resolveOptions($specifiedOptions);

        // store now for later
        $this->now = $options['now'];

        // put the text into a normal format to ease scanning, then build
        // one base token per word
        $tokens = $this->baseTokenize($this->preNormalize($text));

        // The Repeater scanner is the only one that receives the options.
        $tokens = $this->componentFactory('Repeater')->scan($tokens, $options);

        $scanners = array('Grabber', 'Pointer', 'Scalar', 'Ordinal', 'Separator', 'Timezone');
        foreach ($scanners as $name) {
            $tokens = $this->componentFactory($name)->scan($tokens);
        }

        // Strip any non-tagged tokens. The survivors are collected into a
        // fresh, sequentially indexed list (array_filter() would keep the
        // original keys and leave gaps), matching the Ruby select() this
        // was ported from.
        $tagged = array();
        foreach ($tokens as $token) {
            if ($token->tagged()) {
                $tagged[] = $token;
            }
        }
        $tokens = $tagged;

        if (Horde_Date_Parser::$debug) {
            echo "+---------------------------------------------------\n";
            echo "| " . implode(', ', $tokens) . "\n";
            echo "+---------------------------------------------------\n";
        }

        // do the heavy lifting
        $span = $this->tokensToSpan($tokens, $options);

        // guess a time within the span if required
        if ($options['guess']) {
            return $this->guess($span);
        }
        return $span;
    }

    /**
     * Validates caller-supplied options and merges them over the defaults.
     *
     * @param array $specifiedOptions  Options passed to parse().
     *
     * @return array  Complete option set.
     *
     * @throws InvalidArgumentException  On an unknown option key or an
     *                                   invalid 'context' value.
     */
    protected function _resolveOptions($specifiedOptions)
    {
        // 'now' is filled in lazily below so that no Horde_Date is built
        // when the caller supplies one (or when validation fails).
        $defaultOptions = array(
            'context' => 'future',
            'now' => null,
            'guess' => true,
            'ambiguousTimeRange' => 6,
        );

        // array_key_exists() rather than isset(): the 'now' default is null.
        foreach (array_keys($specifiedOptions) as $key) {
            if (!array_key_exists($key, $defaultOptions)) {
                throw new InvalidArgumentException("$key is not a valid option key");
            }
        }

        $options = array_merge($defaultOptions, $specifiedOptions);

        // Strict comparison: a loose in_array() accepts values such as 0
        // on older PHP versions.
        if (!in_array($options['context'], array('past', 'future', 'none'), true)) {
            throw new InvalidArgumentException("Invalid value " . $options['context'] . " for 'context' specified. Valid values are 'past', 'future', and 'none'");
        }

        if ($options['now'] === null) {
            $options['now'] = new Horde_Date();
        }

        return $options;
    }

    /**
     * Cleans up the specified input text by stripping unwanted characters,
     * converting idioms to their canonical form, converting number words
     * to numbers (three => 3), and converting ordinal words to numeric
     * ordinals (third => 3rd).
     *
     * @param string $text  Raw input text.
     *
     * @return string  Normalized text.
     */
    public function preNormalize($text)
    {
        $normalizedText = strtolower($text);
        $normalizedText = $this->numericizeNumbers($normalizedText);

        // Rewrite rules as (pattern, replacement) pairs. Order matters:
        // each rule sees the output of the previous one (e.g. 'before now'
        // must be handled before the bare 'now' and 'before' rules).
        $rules = array(
            array('/[\'"\.]/', ''),
            array('/([\/\-\,\@])/', ' \1 '),
            array('/\btoday\b/', 'this day'),
            array('/\btomm?orr?ow\b/', 'next day'),
            array('/\byesterday\b/', 'last day'),
            array('/\bnoon\b/', '12:00'),
            array('/\bmidnight\b/', '24:00'),
            array('/\bbefore now\b/', 'past'),
            array('/\bnow\b/', 'this second'),
            array('/\b(ago|before)\b/', 'past'),
            array('/\bthis past\b/', 'last'),
            array('/\bthis last\b/', 'last'),
            array('/\b(?:in|during) the (morning)\b/', '\1'),
            array('/\b(?:in the|during the|at) (afternoon|evening|night)\b/', '\1'),
            array('/\btonight\b/', 'this night'),
            // NOTE(review): the lookahead (?=\w) is always satisfied by the
            // alternation that follows it, so a space is inserted before
            // any word-final am/pm/oclock, even inside a longer word. A
            // lookbehind may have been intended; left unchanged.
            array('/(?=\w)([ap]m|oclock)\b/', ' \1'),
            array('/\b(hence|after|from)\b/', 'future'),
        );
        foreach ($rules as $rule) {
            $normalizedText = preg_replace($rule[0], $rule[1], $normalizedText);
        }

        return $this->numericizeOrdinals($normalizedText);
    }

    /**
     * Converts number words to numbers (three => 3).
     *
     * @param string $text  Text to convert.
     *
     * @return mixed  Result of Horde_Support_Numerizer::numerize().
     */
    public function numericizeNumbers($text)
    {
        return Horde_Support_Numerizer::numerize($text, array('locale' => $this->locale));
    }

    /**
     * Converts ordinal words to numeric ordinals (third => 3rd).
     *
     * The base implementation returns the text unchanged.
     *
     * @param string $text  Text to convert.
     *
     * @return string  The text, unchanged.
     */
    public function numericizeOrdinals($text)
    {
        return $text;
    }

    /**
     * Splits the text on whitespace and converts each word into a Token.
     *
     * @param string $text  Normalized text.
     *
     * @return array  List of Horde_Date_Parser_Token objects, one per word.
     */
    public function baseTokenize($text)
    {
        // PREG_SPLIT_NO_EMPTY: empty or whitespace-padded input must not
        // produce empty-string tokens.
        $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);

        $tokens = array();
        foreach ($words as $word) {
            $tokens[] = new Horde_Date_Parser_Token($word);
        }
        return $tokens;
    }

    /**
     * Guesses a specific time within the given span.
     *
     * NOTE(review): width is read through the width() method only; the
     * previous code mixed a property read with a method call. Confirm
     * against the span class. The addition below assumes $span->begin is
     * numeric; if it is a date object this needs that class's own
     * arithmetic -- TODO confirm.
     *
     * @param mixed $span  Span with a begin member and a width() method, or
     *                     an empty value.
     *
     * @return mixed  Null for an empty span; the midpoint when the width
     *                exceeds 1; otherwise the span's begin.
     */
    public function guess($span)
    {
        if (empty($span)) {
            return null;
        }

        $width = $span->width();
        if ($width > 1) {
            return $span->begin + ($width / 2);
        }
        return $span->begin;
    }

}