python

How to parse an RSS or Atom feed with Python and the Universal Feed Parser library

Tagged universal, feed, parser, atom, rss, python  Languages python

This example uses the Universal Feed Parser, one of the best and fastest parsers for Python.

Feed Parser is a lot faster than feed_tools for Ruby and it's about as fast as the ROME Java library according to my simple benchmark.

Feed Parser uses less memory and about as much of the CPU as ROME, but this wasn't tested with a long running process, so don't take my word for it.

import time
import feedparser

# Wall-clock timestamp taken before the first feed is requested.
start = time.time()

feeds = ['http://..', 'http://']

for url in feeds:
    # The request options are handed to the parser as keyword arguments.
    feed = feedparser.parse(
        url,
        agent='..',
        etag='..',
        modified=feedparser._parse_date('Sat, 29 Oct 1994 19:43:31 GMT'),
        referrer='..'
    )

    # Number of entries, then the feed title as UTF-8 bytes.
    print(len(feed.entries))
    print(feed.feed.title.encode('utf-8'))

end = time.time()

# Total elapsed time for all feeds, in seconds.
print('fetch took %0.3f s' % (end - start))

How to use ElasticSearch with Python

Tagged elasticsearch, python, pyes  Languages python

This is a short example of how to use ElasticSearch with Python.

First install pyes (pyes documentation).

Then run this code:

# https://pyes.readthedocs.org/en/latest/references/pyes.es.html
# http://davedash.com/2011/02/25/bulk-load-elasticsearch-using-pyes/
from pyes import *

index_name = 'xxx'
type_name = 'car'

# Connection to the local node; the timeout value is passed straight to pyes.
conn = ES('127.0.0.1:9200', timeout=3.5)

docs = [
    {"name": "good", "id": '1'},
    {"name": "bad", "id": '2'},
    {"name": "ugly", "id": '3'},
]

# Bulk index
# Signature for reference:
# index(doc, index, doc_type, id=None, parent=None, force_insert=False, op_type=None, bulk=False, version=None, querystring_args=None)
for doc in docs:
    conn.index(
        doc,
        index=index_name,
        doc_type=type_name,
        id=doc['id'],
        bulk=True
    )

# Print whatever refresh() returns.
print(conn.refresh())

# Search
def search(query):
    """Print every result found for `query` in the index `index_name`.

    The text is wrapped in a StringQuery whose default operator is AND.
    """
    string_query = StringQuery(query, default_operator="AND")
    for hit in conn.search(query=string_query, indices=[index_name]):
        print(hit)


search("good")

You can also use curl to verify that it works:

# Show index mapping
curl -vvv "http://127.0.0.1:9200/xxx/_mapping?pretty=1"

# Delete index
curl -XDELETE -vvv "http://127.0.0.1:9200/xxx"

# Search
curl -vvv "http://127.0.0.1:9200/xxx/_search?pretty=1"

SQLAlchemy example

Tagged sqlalchemy, python  Languages python

SQLAlchemy example:

from sqlalchemy import *
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# echo is switched off at construction time instead of on the engine afterwards.
engine = create_engine(
    "mysql://username:password@localhost/database_name",
    echo=False
)

# Declarative base class that the mapped classes inherit from.
Base = declarative_base()

class Entry(Base):
    """ORM mapping for one row of the ``entries`` table."""

    __tablename__ = "entries"

    id = Column(Integer, primary_key=True)
    title = Column(String(255))
    # MySQL cannot create a VARCHAR column without a length, so the URL gets
    # an explicit (generous) one instead of a bare String.
    url = Column(String(2048))

    def __repr__(self):
        """Return a short debugging representation with title and URL."""
        return "<Entry('%s', '%s')>" % (self.title, self.url)

# Set up handles
entry_table = Entry.__table__
metadata = Base.metadata
# Create the tables for all mapped classes.
metadata.create_all(engine)

# Start a session
Session = sessionmaker(bind=engine)
session = Session()

# Query entries
entries = session.query(Entry).filter(Entry.title != 'Zermatt')

# Print all entries (Entry defines __repr__, so printing the object is enough;
# Entry has no update_named_entities() method)
for entry in entries.all():
    print(entry)

# Fetch the first entry; first() returns None when no row matches
entry = entries.first()

# Update entry, but only if the query actually returned one
if entry is not None:
    entry.title = 'Zermatt, Verbier'

# Commit changes
session.commit()

A simple Python HTTP client

Tagged python, http, client  Languages python

A simple HTTP client that I wrote a long time ago and had lying around. It supports cookies and detects redirects:

#!/usr/bin/env python
#
#     Http
#
#     A simple HTTP client that supports persistent cookies
#

import cookielib
import httplib
#httplib.HTTPConnection.debuglevel = 1
import urllib2

class Http:
    """A simple HTTP client that keeps cookies between requests.

    Cookies set by one response are stored in ``cookie_jar`` and sent with
    later requests made through the same instance. When a request ends up at
    a different URL than the one asked for, ``redirect_callback`` is invoked.
    """

    def __init__(self, redirect_callback=None):
        """Create the client.

        redirect_callback -- optional callable that receives the response
            whenever the final URL differs from the requested URL.
        """
        self.redirect_callback = redirect_callback
        self.cookie_jar = cookielib.CookieJar()
        self.opener = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(self.cookie_jar))

        # Also installs this opener as the default used by urllib2.urlopen().
        urllib2.install_opener(self.opener)

    def get(self, url, headers=None):
        """Send a GET request to `url` and return the response.

        headers -- optional dict of extra request headers.
        """
        # urllib2.Request iterates over the headers mapping, so None has to
        # be replaced by an empty dict.
        request = urllib2.Request(url, headers=headers or {})
        return self.execute_request(request)

    def post(self, url, headers=None, parameters=None):
        """Send `parameters` url-encoded to `url` and return the response.

        headers    -- optional dict of extra request headers.
        parameters -- optional mapping (or sequence of pairs) of form fields.
            When it is None no body is attached to the request.
        """
        # Imported here because the file's import block only brings in
        # urllib2, while urlencode lives in urllib.
        import urllib

        data = None
        if parameters is not None:
            data = urllib.urlencode(parameters)

        request = urllib2.Request(url, data, headers or {})
        return self.execute_request(request)

    def execute_request(self, request):
        """Open `request` and return the response, reporting redirects.

        Raises RuntimeError when the request was redirected and no
        redirect callback was supplied.
        """
        response = self.opener.open(request)
        # Check for redirect, maybe better way to do this
        if response.geturl() != request.get_full_url():
            if self.redirect_callback is None:
                # String exceptions are not valid; raise a real exception.
                raise RuntimeError(
                    "Redirected to '" + response.geturl() +
                    "' but no redirect callback defined")
            self.redirect_callback(response)

        return response

How to retrieve information about Python errors in a C extension

Tagged python, pyeval_callobject, pyerr_fetch  Languages python
result = PyEval_CallObject(tmp_callback, args);
    // result == NULL means an error occurred
    if (PyErr_Occurred()) {
        PyObject* ptype;
        PyObject* pvalue;
        PyObject* ptraceback;
        // PyErr_Fetch clears the error indicator and hands us the references
        PyErr_Fetch(&ptype, &pvalue, &ptraceback);
        // The traceback can be NULL even when an exception is set
        if (ptraceback != NULL) {
            printf("Error occurred on line: %d", ((PyTracebackObject*)ptraceback)->tb_lineno);
        }
        // Restore exception instead of disposing of it.
        // PyErr_Restore takes over (steals) all three references, so they
        // must not be Py_XDECREF'ed afterwards.
        PyErr_Restore(ptype, pvalue, ptraceback);
        PyErr_Print();
    }

via http://www.ragestorm.net/tutorial?id=21

How to use a Python decorator wrapper to get a reference to the calling class instance

Tagged python, decorator, self  Languages python
def requires_authentication(method):
    """Decorate a controller method so unauthenticated calls are refused.

    The wrapper is what gets bound to the class, so its first argument
    ``self`` points to the controller instance (e.g. a SheisseController)
    instead of the decorator function.

    The instance must provide the attributes ``_requires_authentication``
    and ``_authenticated``. When authentication is required and missing,
    the result of ``response('403 Forbidden or whatever')`` is returned;
    otherwise the call is passed through to `method` unchanged.
    """
    import functools  # local import: only this decorator needs it

    # wraps() keeps the decorated method's __name__ and __doc__ intact.
    @functools.wraps(method)
    def wrapper(self, *args, **kwargs):
        # Truthiness is used rather than "== True" / "== False" so that a
        # value such as None for _authenticated is treated as not logged in.
        if self._requires_authentication and not self._authenticated:
            return response('403 Forbidden or whatever')

        return method(self, *args, **kwargs)
    return wrapper

class SheisseController:
    """Example controller whose action is guarded by requires_authentication."""

    # Flags read by the wrapper that requires_authentication installs.
    _requires_authentication = True
    _authenticated = False

    @requires_authentication
    def index(self):
        """Placeholder action; the real work of the page goes here."""
        pass

How to parse XML with Python's built-in ElementTree parser

Tagged elementtree, python, xml, parse  Languages python
from xml.etree.ElementTree import fromstring, tostring

namespace = 'https://xxx.com/xxx'


def _qualified(tag):
    """Build the descendant search path for `tag` inside `namespace`."""
    return './/{%s}%s' % (namespace, tag)


element = fromstring(xml)

# The device is looked up from the root; details and series from the device.
device = element.find(_qualified('Device'))
detail = device.find(_qualified('Details'))
series = device.findall(_qualified('Series'))

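Watch out for namespaces: when the document declares one, every tag in the search path has to be written as {namespace-uri}Tag, otherwise find() will not match anything.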

How to generate n-grams with Python and NLTK

Tagged nltk, ngram, bigram, trigram, word gram  Languages python
import nltk
from nltk.util import ngrams

def word_grams(words, min=1, max=4):
    """Return the word n-grams of `words` as space-joined strings.

    Sizes run from `min` up to, but not including, `max` (both are handed
    to range()). All n-grams of one size come before those of the next.
    """
    return [
        ' '.join(str(token) for token in gram)
        for size in range(min, max)
        for gram in ngrams(words, size)
    ]

# Demo: print the n-grams of a four-word sentence using the default bounds.
sentence = 'one two three four'
print(word_grams(sentence.split(' ')))

How to install and use the mysql-python library

Tagged python, mysql, mysql-python, install  Languages python

First download mysql-python from http://sourceforge.net/projects/mysql-python.

Extract it and run:

python setup.py build
sudo python setup.py install

If you get the following error, you need to install the python-dev package:

In file included from _mysql.c:29:
pymemcompat.h:10:20: error: Python.h: No such file or directory
_mysql.c:30:26: error: structmember.h: No such file or directory
In file included from /usr/include/mysql/mysql.h:44,
                 from _mysql.c:40:
.
.
.
_mysql.c:2808: warning: return type defaults to 'int'
_mysql.c: In function 'DL_EXPORT':
_mysql.c:2808: error: expected declaration specifiers before 'init_mysql'
_mysql.c:2886: error: expected '{' at end of input
error: command 'gcc' failed with exit status 1

Installing the python-dev package on Debian is done with apt-get or synaptic:

apt-get install python-dev

Installing the library should now work:

python setup.py build
python setup.py install

Next test the library in the python console:

import MySQLdb

# Note that this example uses UTF-8 encoding
conn = MySQLdb.connect(
    host='localhost',
    user='...',
    passwd='...',
    db='...',
    charset="utf8",
    use_unicode=True
)
cursor = conn.cursor()

cursor.execute("SELECT * FROM cities")
rows = cursor.fetchall()

for row in rows:
    # The second column is encoded to UTF-8 before it is printed.
    print("%s, %s" % (row[0], row[1].encode('utf-8')))

print("Number of rows returned: %d" % cursor.rowcount)

Don't forget to close the cursor and connection, and if you're inserting data commit before closing, because autocommit is disabled by default:

# Release the cursor once its results are no longer needed.
cursor.close ()
# Commit before closing the connection: autocommit is disabled by default,
# so inserted data would otherwise not be saved.
conn.commit ()
# Finally close the connection itself.
conn.close ()

For more information about MySQLdb see this article.

How to use Python's simplejson to read and write JSON data

Tagged simplejson, python, json  Languages python

First you need to install simplejson:

easy_install simplejson

Now you can dump data to JSON:

import simplejson as json

class Something:
    """Tiny example object that can serialise its own attributes."""

    def __init__(self):
        self.test = "test"

    def to_json(self):
        """Return the instance's attribute dictionary as a JSON string."""
        attributes = vars(self)
        return json.dumps(attributes)

Or if you have complex objects:

import simplejson as json
class Something:
    """Example holder of several objects that are serialised as a JSON list."""

    def __init__(self):
        self.test = [Other('a', 'b'), Other('a', 'c')]

    def to_json(self):
        """Return a JSON array with one object per item in ``self.test``.

        Each item is represented by its attribute dictionary.
        """
        # The objects live in self.test; self.devices is never assigned.
        return json.dumps([item.__dict__ for item in self.test])