N-gram snippets

Generating word n-grams with Ruby

Tagged ngram, bigram, unigram, ruby  Languages ruby

How to generate word n-grams with Ruby:

require 'set'

#
# Extract word n-grams, e.g. uni-, bi-, and trigrams.
#
# "one two three".ngram(1) # unigrams
# "one two three".ngram(2) # bigrams
# "one two three".ngram(3) # trigrams
# "one two three".ngrams(1..3) # unigrams, bigrams, trigrams
#
module Ngram
  REGEX = /\w+/
  def ngram_tokenize
    self.downcase.scan(REGEX)
  end

  def ngram(n=1)
    res = Set.new
    words = ngram_tokenize
    word_count = words.length
    words.each_with_index do |word, ix|
      min = ix
      max = ix + (n-1)
      break if word_count <= max
      res.add words[min..max].join(' ')
    end
    res
  end

  def ngrams(range=1..3)
    return ngram(range) if range.is_a?(Integer)
    res = Set.new
    range.each do |n|
      res.merge ngram(n)
    end
    res
  end
end

class String
  include Ngram
end

"MSFT is xxx. AAPL is yyy.".ngram # unigram
"MSFT is xxx. AAPL is yyy.".ngram(2) # bigram
"MSFT is xxx. AAPL is yyy.".ngram(3) # trigram
"MSFT is xxx. AAPL is yyy.".ngrams(1..3) # uni, bi, trigram

How to generate n-grams with Python and NLTK

Tagged nltk, ngram, bigram, trigram, word gram  Languages python

import nltk
from nltk.util import ngrams

def word_grams(words, min_n=1, max_n=4):
    """Return word n-grams of length min_n up to (but not including) max_n."""
    grams = []
    for n in range(min_n, max_n):
        for ngram in ngrams(words, n):
            grams.append(' '.join(str(i) for i in ngram))
    return grams

print(word_grams('one two three four'.split(' ')))
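
With the defaults this prints unigrams, bigrams, and trigrams of the input:

# ['one', 'two', 'three', 'four', 'one two', 'two three', 'three four',
#  'one two three', 'two three four']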

ElasticSearch Wildcard and NGram Search With Tire

Tagged ngram, wildcard, elasticsearch, tire  Languages ruby

How to implement wildcard search with Tire and Elasticsearch:

settings analysis: {
    filter: {
      ngram_filter: {
        type: "nGram",
        min_gram: 1,
        max_gram: 15
      }
    },
    analyzer: {
      index_ngram_analyzer: {
        tokenizer: "standard",
        filter: ["standard", "lowercase", "stop", "ngram_filter"],
        type: "custom"
      },
      search_ngram_analyzer: {
        tokenizer: "standard",
        filter: ["standard", "lowercase", "stop"],
        type: "custom"
      }
    }
  }

  mapping do
    indexes :name,
      search_analyzer: 'search_ngram_analyzer',
      index_analyzer: 'index_ngram_analyzer', 
      #analyzer: 'index_ngram_analyzer', 
      boost: 100.0
      # …
  end
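
For context, the settings and mapping calls above normally live inside the model class that Tire indexes. A minimal sketch, assuming an ActiveRecord-backed Activity model in an app where Tire is loaded (the model name and the NGRAM_ANALYSIS constant are illustrative, not from the original):

NGRAM_ANALYSIS = {
  filter: {
    ngram_filter: { type: "nGram", min_gram: 1, max_gram: 15 }
  },
  analyzer: {
    index_ngram_analyzer:  { tokenizer: "standard", filter: ["standard", "lowercase", "stop", "ngram_filter"], type: "custom" },
    search_ngram_analyzer: { tokenizer: "standard", filter: ["standard", "lowercase", "stop"], type: "custom" }
  }
}

class Activity < ActiveRecord::Base
  include Tire::Model::Search
  include Tire::Model::Callbacks

  # The index-time analyzer generates ngrams while the search-time analyzer
  # does not, so a query like "simp" can match the indexed ngrams of "simpsons".
  settings analysis: NGRAM_ANALYSIS do
    mapping do
      indexes :name,
        search_analyzer: 'search_ngram_analyzer',
        index_analyzer:  'index_ngram_analyzer',
        boost: 100.0
    end
  end
end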

With curl, make sure the mapping is set up properly:

curl 'http://localhost:9200/activities/_mapping?pretty=true'
{
  "skulls" : {
    "skull" : {
      "_all" : {
        "auto_boost" : true
      },
      "properties" : {
        "name" : {
          "type" : "string",
          "boost" : 100.0,
          "analyzer" : "index_ngram_analyzer"
        }
      }
    }
  }
}

You now have wildcard-style search, as long as you remember to specify the fields you want to search, because by default queries go against the _all field, which is not analyzed with the ngram filter:

# This searches the _all field
curl 'http://localhost:9200/activities/_search?q=simpsons&pretty=true'

# This restricts the query to the ngram-analyzed name field, so it really works
curl -XGET 'http://localhost:9200/activities/_search?pretty' -d ' 
{ 
   "query" : { 
      "query_string" : { 
         "query" : "simpsons", 
         "fields" : ["name"] 
      } 
   } 
}'
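
The same field-restricted query can be issued from Ruby with Tire's search DSL. A sketch, assuming the index is named activities and that Tire's string query passes extra options (such as fields) through to the query_string query; verify both against your Tire version:

require 'tire'

search = Tire.search('activities') do
  query do
    # query_string query limited to the ngram-analyzed name field
    string 'simpsons', fields: ['name']
  end
end

search.results.each { |result| puts result.name }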