python, analyzing csv files, part 2

Previously, we discussed analyzing CSV files, parsing the csv into a native python object that supports iteration while providing easy access to the data (such as a sum by column header).

For very large files this can be cumbersome, especially where more advanced analytics are desired.

I would like to keep the same simple interface but use an in-memory database connection, thus transforming the CSV files into database tables for deeper analysis.

For example, I would like to do the following (leveraging the builtin sqlite3 module):

>>>
>>> reports = Reports('/home/user/reports-1109')
>>> reports.billing_detail.sum('Tax Amount', {'Fiscal Period':'2011-09'})
Decimal('123321.1')
>>>
>>> reports.tax_summary.sum('Amount', {'Fiscal Period':'2011-09'})
Decimal('123321.1')
>>>
>>> len(reports.billing_detail)
719153
>>>
>>> curs = reports.conn.cursor()
>>> curs.execute('SELECT name FROM sqlite_master WHERE type="table"').fetchall()
[(u'billing_detail',), (u'billing_summary',)]
>>>

This approach can be orders of magnitude faster for even the most basic analysis. Furthermore, this allows OLAP cube analysis of the data from the CSV files, e.g.,

>>>
>>> curs.execute('CREATE TABLE t_fact(id TEXT UNIQUE, b INT, t INT, r INT)').fetchall()
[]
>>> curs.execute('CREATE INDEX IF NOT EXISTS idxt ON t_fact(id)').fetchall()
[]
>>>
>>> ## load some data into the fact table
>>> curs.execute('''INSERT OR REPLACE INTO t_fact(id,b,t,r)
                SELECT bd.%(id)s as id, bd.ROWID as b, ts.ROWID as t, rf.ROWID as r
                FROM billing_detail bd
                LEFT OUTER JOIN tax_summary ts ON bd.%(id)s = ts.%(tax_id)s
                LEFT OUTER JOIN refunds r ON bd.%(id)s = rf.%(ref_id)s
                ''' % query_dict).fetchall()
[]
>>>
>>> ## e.g., find billing records without tax summaries
>>> billings_without_tax = curs.execute('SELECT id FROM t_fact WHERE t IS NULL').fetchall()
>>>

Using the same Report and Reports objects discussed previously, the code can be modified to leverage a database connection to support this type of analytics:

class Report(collections.Mapping):
    def __init__(self, filehint, table = None, conn = sqlite3.connect(':memory:')):
        self.filename = Reports.find_report(filehint)
        self.info = []
        self.headers = []
        self.table = table
        self.conn = conn
        self.indexes = []
        self._load()

    def _load(self):
        logging.debug('loading %s' %(self.filename))
        curs = self.conn.cursor()
        fh = open(self.filename)
        reader = csv.reader(fh)
        self.info = reader.next()
        self.headers = reader.next()
        columns = ', '.join(['c'+str(x) for x in range(len(self.headers))])
        columnTypes = ' TEXT, '.join(['c'+str(x) for x in range(len(self.headers))]) + ' TEXT'
        try:
            curs.execute('CREATE TABLE %s(%s)' %(self.table, columnTypes))
        except sqlite3.OperationalError as e:
            logging.debug('%s -- using existing table' %(e))
        else:
            curs.executemany('INSERT INTO %s (%s) VALUES(%s)' %(
                self.table, columns,
                '?, ' * (len(self.headers) -1) + '?'
            ), reader)
            self.conn.commit()
        curs.close()

    def _column(self, key):
        if key.lower() not in [x.lower() for x in self.headers]:
            raise IndexError('%s not in %s'%(key, self.table))
        return 'c' + str([x.lower() for x in self.headers].index(key.lower()))

    def create_index(self, col):
        col = self._column(col)
        icol = 'i' + col
        if icol not in self.indexes:
            logging.debug('adding index %s to %s(%s)' %(icol, self.table, col))
            curs = self.conn.cursor()
            curs.execute('CREATE INDEX IF NOT EXISTS %s ON %s(%s)' %(icol, self.table, col))
            curs.close()
            self.indexes.append(icol)

    def __getitem__(self, key):
        curs = self.conn.cursor()
        res = list(curs.execute('SELECT * FROM %s WHERE ROWID = %s' %(self.table, key+1)).fetchall()[0])
        curs.close()
        return res

    def __iter__(self):
        curs = self.conn.cursor()
        self.__iter = curs.execute('SELECT * FROM %s' %(self.table))
        curs.close()
        return self

    def next(self):
        return self.__iter.next()

    def __len__(self):
        curs = self.conn.cursor()
        ret = curs.execute('SELECT COUNT(*) FROM %s' %(self.table)).fetchall()[0][0]
        curs.close()
        return ret

    def get(self, column, value):
        '''get rows where column matches value'''
        curs = self.conn.cursor()
        column = self._column(column)
        ret = curs.execute('SELECT * FROM %s WHERE %s = "%s"' %(self.table, column, value)).fetchall()
        curs.close()
        return ret

    def sum(self, col, filter = {}):
        curs = self.conn.cursor()
        _where = []
        for k,v in filter.iteritems():
            _where.append(' %s = "%s" ' %(self._column(k),v) )
        ret = Decimal(str(curs.execute('SELECT SUM(%s) FROM %s %s' %(
            self._column(col),
            self.table,
            ' WHERE ' + ' AND '.join(_where) if _where else ''
            )).fetchall()[0][0]))
        curs.close()
        return ret
Posted in data architecture, python, software architecture | Leave a comment

python, analyzing csv files, part 1

I would like to analyze a collection of CSV (comma-separated-values) files in python. Ideally, I would like to treat the csv data as a native python object.

For example,

>>> financial_detail = Report('financial-detail.csv')
>>> transactions = {}
>>> for row in financial_detail:
...   transactions.append(row['Transaction'])
...
>>> financial_detail.sum('Tax Amount')
Decimal('123456.10')
>>>

Additionally, I would like to easily retrieve row data by column name, e.g.,

>>> financial_detail = Report('financial-detail.csv')
>>>
>>> financial_detail[0]['Transaction']
'6324565'
>>>
>>> ## the above should be equivalent to
>>> financial_detail.headers.index('Transaction')
7
>>> financial_detail[0][7]
'6324565'
>>>

Given a directory of CSV files, I would like to retrieve these files (magically) by name, e.g.,

>>> reports = Reports('/directory/of/reports/')
>>> len(reports.sales_tax)
4884
>>>
>>> len(reports.finance_detail)
512916
>>> reports.sales_tax.sum('Amount') - reports.financial_detail.sum('Tax Amount')
Decimal('0.0')
>>>

This can be accomplished by leveraging the python builtin csv module, the collections.Mapping abstract base class, as well as the python magic __getattr__ methods, i.e.,

!/usr/bin/env python
# vim: set tabstop=4 shiftwidth=4 autoindent smartindent:
import csv, glob, os, sys
import collections
from decimal import Decimal

## set the logging level
import logging
logging.basicConfig(
  level=logging.DEBUG,
  format='%(levelname)s:%(filename)s:%(lineno)d -- %(message)s'
)

class Report(collections.Mapping):
    '''
    Interface into a CSV Report

    Parses the csv into an iterable collection and provides memory of pivot sums
    '''

    def __init__(self, *filehints):
        self.filename = Reports.find_report(*filehints)
        self.info = []
        self.headers = []
        self.data = []
        self.pivot_sums = {}
        self.load()

    def load(self):
        logging.debug('loading %s' %(self.filename))
        fh = open(self.filename)
        reader = csv.reader(fh)
        self.info = reader.next()
        self.headers = reader.next()
        for row in reader:
            self.data.append(row)
        self.curval = 0 

    def __getitem__(self, key):
        return VirtualRecord(self.data[key], self)

    def __iter__(self):
        self.curval = 0
        return self

    def next(self):
        if self.curval < 0 or self.curval >= (len(self.data) - 1):
            raise StopIteration
        else:
            self.curval += 1
            return VirtualRecord(self.data[self.curval], self)

    def __len__(self):
        return len(self.data)

    def sum(self, col):
        if col not in self.pivot_sums:
            logging.debug('computing sum for %s in %s' %(col, ', '.join(self.info)))
            sum = Decimal(0)
            index = col
            if isinstance(index, str):
                index = [x.lower() for x in self.headers].index(index.lower())
            for i in self.data:
                sum += Decimal(i[index]) if i[index] != '' else 0
            self.pivot_sums[col] = sum
        return self.pivot_sums[col]

class VirtualRecord():
    '''
    A virtual record within a Report object, only one instance exists at a time (not-thread safe)
    to avoid the memory crunch from large report files
    '''
    _instance = None
    def __new__(self, *args, **kwargs):
        if not self._instance:
            self._instance = super(VirtualRecord, self).__new__(self)
        return self._instance

    def __init__(self, row, parent):
        self.row = row
        self.parent = parent

    def __getitem__(self, key):
        index = key
        if isinstance(key, str):
            index = [x.lower() for x in self.parent.headers].index(key.lower())
        return self.row[index]

    def __len__(self):
        return len(self.row)

class Reports:
    '''
    Magic collection of billing reports
    '''

    reports = {}

    def __init__(self, reportdir = '.'):
        self.reportdir = reportdir

    def __getattr__(self, report):
        '''
        magically find reports within self.reportdir

        For example, if there is a report 'sales-tax.2011-09.20111018.csv'
        You can access this with an attribute 'sales_tax', e.g.,
        >>>
        >>> reports = Reports()
        >>> len(reports.sales_tax)
        4884
        >>>
        '''
        filename = Reports.find_report(self.reportdir, report.replace('_', '?') + '.*.csv')
        if filename is None or not os.stat(filename):
            return None
        if report not in self.reports:
            self.reports[report] = Report(filename, table=report)
        return self.reports[report]

    @classmethod
    def find_report(self, *hints):
        files = glob.glob( os.path.join(*hints) )
        if len(files) == 1:
            return files[0]
        elif len(files) == 0:
            logging.error('No reports found: find_report(%s)' %(os.path.join(*hints)))
        else:
            logging.error('Found too many matching reports: find_report(%s) => %s' %(os.path.join(*hints), ', '.join(files) ))
Posted in data architecture, python, software architecture | Leave a comment

python, unique files by content

I would like to retrieve a list of unique files by content rather than by filename.

That is, if spam.txt and eggs.txt both contained the same contents I want only one of them to return. A very simple approach is to compute a SHA-1 checksum on each file, and build a dictionary with the checksum as the unique key.

#!/usr/bin/env python
# vim: set tabstop=4 shiftwidth=4 autoindent smartindent:
import hashlib, sys
import logging

def _dupdoc(filelist):
	'''
	returns a list of unique files (by content rather than filename)
	that is, if spam.txt and eggs.txt both contained the same contents,
	only one filename will be returned
	'''
	shasums = {}
	for file in filelist:
		try:
			fh = open(file, 'rb')
			sha1 = hashlib.sha1(fh.read()).hexdigest()
			if sha1 not in shasums:
				shasums[sha1] = file
				logging.debug('%s %s' %(file, sha1))
		except IOError as e:
			logging.warning('could not open %s' %(file))
	uniquelist = [file for file in shasums.values()]
	return uniquelist

if __name__ == "__main__":
	'''
	command-line, accept either a list of files in STDIN
	or a single filename argument that contains a list of files
	'''

	filelist = []
	if len(sys.argv) > 1:
		fh = open(sys.argv[1], 'r')
		filelist = fh.readlines()
		fh.close()
	else:
		filelist = sys.stdin.readlines()
	filelist = [file.strip() for file in filelist]
	uniques = _dupdoc(filelist)
	for file in uniques:
		print file

The commandline __main__ portion of the program expects an optional command line argument, or if no argument is specified than a filelist will be read on STDIN, e.g.,

#  find test -type f | dupdoc
test/spam1.txt
test/spam9.txt
#
Posted in python, shell tips | Leave a comment

python daemon

I would like to create a python daemon, completely detaching from the terminal or parent process, and yet retaining any log handlers through the python logging module. There is a wonderful example at cookbook-278731 of a well-behaved daemon, and see PEP 3143.

Borrowing from these examples, here is a simple daemonize function that will retain thread-safe logging handlers that were setup through the logging module.

# vim: set tabstop=4 shiftwidth=4 autoindent smartindent:
'''
Daemon (python daemonize function)

Detach process through double-fork (to avoid zombie process), close all
file descriptors, and set working directory to root (to avoid umount problems)
'''

import os, resource
import logging

# target environment
UMASK = 0
WORKINGDIR = '/'
MAXFD = 1024
if (hasattr(os, "devnull")):
    REDIRECT_TO = os.devnull
else:
    REDIRECT_TO = "/dev/null"

def daemonize():
    '''Detach this process and run it as a daemon'''

    try:
        pid = os.fork() #first fork
    except OSError, e:
        raise Exception, "%s [%d]" % (e.strerror, e.errno)

    if (pid == 0): #first child
        os.setsid()
        try:
            pid = os.fork() #second fork
        except OSError, e:
            raise Exception, "%s [%d]" % (e.strerror, e.errno)

        if (pid == 0): #second child
            os.chdir(WORKINGDIR)
            os.umask(UMASK)
        else:
            os._exit(0)
    else:
        os._exit(0)

    #close all file descriptors except from non-console logging handlers
    maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
    if (maxfd == resource.RLIM_INFINITY):
        maxfd = MAXFD
    filenos = []
    for handler in logging.root.handlers:
        if hasattr(handler, 'stream') and hasattr(handler.stream, 'fileno') and handler.stream.fileno() > 2:
            filenos.append( handler.stream.fileno() )
    for fd in range(0, maxfd):
        try:
            if fd not in filenos:
                os.close(fd)
        except OSError:
            pass

    #redirect stdin, stdout, stderr to null
    os.open(REDIRECT_TO, os.O_RDWR)
    os.dup2(0, 1)
    os.dup2(0, 2)

    return(0)

Simply call the daemonize() function within your application, e.g.,

import logging
import Daemon

logging.info('daemonize?')
Daemon.daemonize():
logging.info('daemonized! we are now safely detached')

From a shell, console logging will only appear before daemonize() was called, e.g.,

# python test/daemon.py
INFO:daemon.py:4 -- daemonize?
#

And the logfile output:

# cat test.log
2011-09-27 13:26:02,712 INFO:daemon.py:4 -- daemonize?
2011-09-27 13:26:02,717 INFO:daemon.py:6 -- daemonized! we are now safely detached
#
Posted in python, software architecture | Leave a comment

python logging

I would like customizable logging in python applications, and I would like to easily send log messages to multiple handlers without any modification of the application code. The built-in logging module provides a very robust and easy-to-use logging capability.

In it's simplest form, log messages will be sent to the console with minimal formatting, e.g.,

>>> import logging
>>>
>>> logging.warning('this is a warning')
WARNING:root:this is a warning

In fact, your application and module code can simply use the built-in logging methods which can inherit logging handlers set by the main application code.

Below is an example of a logging configuration that creates three logging handlers:

import logging

# Log to file
logging.basicConfig(
    filename='test.log',
    level=logging.INFO,
    format='%(asctime)-15s %(levelname)s:%(filename)s:%(lineno)d -- %(message)s'
)   

# Log to console
console = logging.StreamHandler()
console.setLevel(logging.DEBUG)
console.setFormatter(logging.Formatter('%(levelname)s:%(filename)s:%(lineno)d -- %(message)s'))
logging.getLogger().addHandler(console)

# Log to syslog
from logging.handlers import SysLogHandler
syslog = SysLogHandler(address='/dev/log')
syslog.setFormatter(logging.Formatter('%(asctime)-15s %(levelname)s:%(filename)s:%(lineno)d -- %(message)s'))
logging.getLogger().addHandler(syslog)

Once this module is imported, any logging calls to the root logger [e.g., logging.warning(), logging,critical(), etc] will be processed by all three handlers. You can add as many log handlers as needed while your module code maintains a very simple logging interface, e.g.,

import logging

logging.debug('this is a debug msg')
logging.info('this is an info msg')
logging.warning('this is a warning msg')
logging.error('this is an error msg')
logging.critical('this is a critical error msg')

The console output will look like:

INFO:test_log.py:4 -- this is an info msg
WARNING:test_log.py:5 -- this is a warning msg
ERROR:test_log.py:6 -- this is an error msg
CRITICAL:test_log.py:7 -- this is a critical error msg

The filelog (and syslog) will look like:

2011-09-26 12:30:52,521 INFO:test_log.py:4 -- this is an info msg
2011-09-26 12:30:52,522 WARNING:test_log.py:5 -- this is a warning msg
2011-09-26 12:30:52,522 ERROR:test_log.py:6 -- this is an error msg
2011-09-26 12:30:52,522 CRITICAL:test_log.py:7 -- this is a critical error msg
Posted in python, software architecture | Leave a comment

HTML5 canvas Mandelbrot

I would like to create an animated Mandelbrot visualization using JavaScript on an HTML5 <canvas> element. The Mandelbrot set, popularized by BenoƮt Mandelbrot, is the set of complex numbers that remain bounded under the function zn+1 = zn2 + c. This is known as an escape function; that is, regardless of the size of n, zn never "escapes". Computing the Mandelbrot set can be as computationally complex as desired for a given visualization.

In JavaScript, the escape function can be written as follows:

Mandelbrot.prototype.escapeFunc = function(x, y) {
  r = 0.0; i = 0.0; m = 0.0;
  j = 0;
  while ((j < this.max) && (m < 4.0)) {
    j++;
    m = r * r - i * i;
    i = 2.0 * r * i + y;
    r = m + x;
  }
  return j;
}

For a given HTML5 canvas element, such as

<canvas id="mandelbrot" width="512" height="512">

A Manelbrot set over the complex plane can be represented with the follow object

function Mandelbrot(m) {
  this.m = m;
  this.c = m.getContext("2d");
  this.width = m.width;
  this.height = m.height;
  this.SX = -1.75; // start value real
  this.EX = 0.6;    // end value real
  this.SY = -1.125; // start value imaginary
  this.EY = 1.125;  // end value imaginary
  this.xzoom = (this.EX - this.SX) / (this.width*1.0);
  this.yzoom = (this.EY - this.SY) / (this.height*1.0);
}

Given these functions, rendering a Mandelbrot set on an HTML5 canvas element is as simple as looping through each of the pixels of the canvas, calculating the escape value, and drawing the pixel. Here is a simple render function:

Mandelbrot.prototype.render = function() {
  var prev_h = 0;
  for (var x = 0; x < this.width; x=x+1) {
    for (var y = 0; y < this.height; y=y+1) {
      esc = this.escapeFunc(this.SX + x*this.xzoom, this.SY + y*this.yzoom);
      h = 360 * (esc/this.max)
      if (h != prev_h) {
         perc = Math.floor(100*(h/360))
         this.c.fillStyle='hsla('+ h + ','+ (perc+100) +'%,'+ (60-perc) +'%,'+ this.opacity  +')';
         prev_h = h;
      }
      this.c.fillRect(x,y,1,1);
    }
  }
}

If you have an HTML5 compatible browser you should see an animated example below:

your browser does not support the HTML5 canvas element


Posted in html, javascript | Leave a comment

javascript keyboard buffer, part 4

Previously, we created a JavaScript keyboard buffer that can edit text in a static html page as well as play back each key as it was typed.

I would like to combine all of this into a single JavaScript include called keylogger.js, and rather than specify each div that I want to edit I'd like to specify a classname such that all elements of that class will become editable, e.g.,

<div class="editable">
Spam and eggs are fantastic, although on second though...
</div>

<script src="keylogger.js"></script>
<script type="text/javascript" defer><!--
editableByClassname('editable');
--></script>
Spam and eggs are fantastic, although on second thought...


Please see the example, keylogger.html, that uses multiple editable div's in one page.

The editableByClassname(cls) function registers onclick events that will activate the keyboard buffer, editor, and replay functions for every element of the specified class.

//
// attach onclick event handlers by classname
function editableByClassname(cls) {
  editable = $$(cls);
  for (var i = 0; i < editable.length; i++) {
    editable[i].onclick = initEditable;
  }
}

There are two utility functions, $ and $$. The single $ is a shortcut to document.getElementById and the double $$ returns all elements by classname. I.e.,

//
// shortcut to getElementById
function $(el) {
  return document.getElementById(el);
}

//
// a simple getElementsByClassname implementation
function $$(cl) {
  var retnode = [];
  var myclass = new RegExp('\\b'+cl+'\\b');
  var elem = document.getElementsByTagName('*');
  for (var i = 0; i < elem.length; i++) {
    var classes = elem[i].className;
    if (myclass.test(classes)) retnode.push(elem[i]);
  }
  return retnode;
}
Posted in css, html, javascript, software architecture | Leave a comment

javascript keyboard buffer, part 3

Previously, we created a javascript keyboard buffer that can edit text in a static html page. Using those functions I'd like to add a buffer replay.

I would like to add an onclick event to a specified div such that it will activate the keyboard buffer and leverage the previously discussed handleKey() event handler, as well as the appendChar(el) function. E.g.,

//
// onclick event handler
// initialize this element for keyboard buffer
function initEditable() {
  selectElement(this);
  var SPECIAL = [8, 32, 37, 39, 222];
  document.onkeydown = handleKey(function(k) {return SPECIAL.contains(k)});
  document.onkeypress = handleKey(function(k) {return !SPECIAL.contains(k)});
}

targetDiv = $('target');
targetDiv.onclick = initEditable;

The function calls selectElement(this) which will need to modify the element to add a temporary link for starting a replay. Once selected, that div will be editable, I would like another temporary link that when pressed will remove all temporary links and remove the onkey events (making the div non-editable).

The following JavaScript will enable the target div to toggle between editable and non-editable:

//
// create <a> element
function newA(aid, alabel, ahref, fonclick) {
  var newA = document.createElement('a');
  newA.appendChild(document.createTextNode(alabel));
  newA.setAttribute('href', ahref);
  newA.setAttribute('id', aid);
  newA.onclick = fonclick;
  return newA;
}

//
// prepare <div>, add 'replay' and 'save' links
function selectElement(el) {
  clearSelected();
  origText = el.innerHTML;
  el.innerHTML = '';
  var newDiv = document.createElement('div');
  newDiv.innerHTML = origText;
  newDiv.setAttribute('id', 'editSelected');
  el.appendChild(newDiv);
  el.appendChild(newA('areplay', 'replay', '#', replayKeys));
  el.appendChild(newA('aclear', 'save', 'javascript:clearSelected()', null));
  el.onclick = null;
}

//
// clear any <div> that is intercepting onkey events
// and clear the buffer
function clearSelected() {
  document.onkeypress = null;
  document.onkeyup = null;
  document.onkeydown = null;
  BUFFER = [];
  bp = 0;
  var el = $('editSelected');
  if (el) {
    var oldText = el.innerHTML;
    var pel = el.parentNode;
    pel.removeChild(el);
    pel.removeChild($('areplay'));
    pel.removeChild($('aclear'));
    pel.innerHTML = oldText;
    pel.onclick = initEditable;
  }
}

When the target div is clicked it will activate the keyboard buffer such that all keystrokes will appear in the innerHTML of that div and link to a replayKeys() function. The replayKeys() function simply needs to process the BUFFER according to a timer, e.g.,

TIMER = 100;
//
// replay all keystrokes from buffer
function replayKeys() {
  el = $('editSelected');
  el.innerHTML = origText;
  bp = 0;
  replaying = setInterval(replayHandler, TIMEOUT);
}
function replayHandler() {
  el = $('editSelected');
  if (bp < BUFFER.length) {
    appendChar(el);
  } else {
    replaying ? clearInterval(replaying) : false;
  }
}

The built-in setInterval() function will call a specified function periodically according to the TIMEOUT value in milliseconds, and will continue running until the clearInterval() function is called.

Next, I'd like to combine all of this into a single JavaScript include such that it can register multiple editable div's.

Posted in javascript | Leave a comment

javascript keyboard buffer, part 2

Previously, we created a javascript keyboard buffer and next I'd like to process that buffer to edit text in a static html page.

Given the following simple html:

<div id="keyboardBufferTest">
Spam and eggs
</div>

I would like the keyboard buffer to modify the text inside that div element.

The following JavaScript function will read the buffer and modify the innerHTML of a named element:

//
// append a single character from BUFFER to an elements innerHTML
var bp = 0;
var ENTITIES = {
    13 : '',
    38 : '&',
    60 : '<',
    62 : '>',
   222 : "'"
}
function appendChar(el) {
  if (BUFFER[bp] == 8) {
    appendBackspace(el);
  } else if (BUFFER[bp] in ENTITIES) {
    el.innerHTML += ENTITIES[BUFFER[bp]];
  } else {
    el.innerHTML += String.fromCharCode(BUFFER[bp]);
  }
  bp++;
}
function appendBackspace(el) {
  var endpointer = 1;
  var plength = el.innerHTML.length;
  for (var i in ENTITIES) {
    var entity = ENTITIES[i];
    var entityCheck = el.innerHTML.substr(plength - entity.length)
    if (entity == entityCheck) {
      endpointer = entity.length;
      break;
    }
  }
  el.innerHTML = el.innerHTML.substr(0, plength - endpointer);
}

The BUFFER contains unicode values that need to be encoded into readable characters. In the above example the builtin String.fromCharCode() function is used. The appendChar(el) function simply processes one element at a time from the BUFFER and maintains a buffer pointer so that the BUFFER can be safely read while new keystrokes are recorded.

There is a very simply ENTITIES hash that serves only to maintain basic HTML encoding.

To use this function, simply pass in the element you want modified. This function can be added to the onkey event handler so that changes to a static html page will appear as you type them.

Next, I'd like to add a buffer replay such that changes to the html can be viewed multiple times exactly as they were typed (keystroke by keystroke).

Posted in javascript | Leave a comment

javascript keyboard buffer, part 1

I would like to capture keystrokes in JavaScript, save them to a buffer, and play them back exactly as they were typed.

There are three onkey events that can be assigned event handlers, these are,

  document.onkeydown = eventHandler
  document.onkeypress = eventHandler
  document.onkeyup = eventHandler

When one of these events is triggered, the unicode representation of a keystroke can be captured as follows,

function eventHandler(e) {
  var e = e || event;
  var key = e.keyCode ? e.keyCode : e.charCode;
  BUFFER.push(key);
}

This will add the unicode value from the onkey event to a global array called BUFFER.

Ideally, document.onkeypress would be sufficient to capture all interesting keystrokes, but implementations vary and not all keystrokes are available through onkeypress (e.g., backspace and arrow keys will not be recorded in all browsers).

However, onkeyup and onkeydown do not differentiate between uppercase and lowercase, or any key-combination involving Shift, Ctrl, Alt, etc. A combination of onkeydown and onkeypress can be used to capture the interesting keystrokes.

Some keys; such as Backspace, Space, and arrow keys; will be intercepted by the browser for navigation purposes. Capturing these can be tricky depending on the current focus. For example, when inside a form input or textarea this will be suppressed.

For demonstration sake I'd like to capture all the interesting keys and suppress them from going to the browser (essentially, capturing them in the BUFFER first before they are consumed by other tasks, such as moving the page down).

The following code will (when activated) capture and buffer the keystrokes,

function startCapture() {
  var SPECIAL = [8, 32, 37, 39, 222];
  document.onkeydown = handleKey(function(k) {return SPECIAL.contains(k)});
  document.onkeypress = handleKey(function(k) {return !SPECIAL.contains(k)});
}

//
// closure for onkey event handlers
// * parameter must be a compare function that
// * returns true IFF this events keyCode should be recorded
function handleKey(fcmpkey) { return function(e) {
  var e = e || event;
  var key = e.keyCode ? e.keyCode : e.charCode;
  if (fcmpkey(key)) {
    BUFFER.push(key);
    if (e.preventDefault) {
      e.stopPropagation();
      e.preventDefault();
    } else {
      e.cancelBubble = true;
      e.returnValue = false;
    }
  }
};}

The onkeydown event is used to handle the special characters that wouldn't otherwise be correctly captured by onkeypress. The e.preventDefault() and e.stopPropagation() will suspend the keystrokes from propagating to the browser (e.g., Backspace navigation). In IE this same behavior is accomplished by the e.cancelBubble and e.returnValue attributes.

Most versions of IE tend to short-circuit onkey events when returnValue is set to false, for example, if onkeydown sets the returnValue to false then onkeypress and onkeyup will never be called for that keystroke. Simply put, you cannot rely on the chain of onkey (i.e., onkeydown -> onkeypress -> onkeyup) events if you are trying to, for example, suppress backspace navigation.

Now that the keystrokes are captured in the buffer you can do anything you'd like with them -- you could add browser navigation, edit seemingly non-editable blocks of text, or replay the keystrokes at later times.

Next, I'd like to use the BUFFER to edit static html and replay everything that was typed.

Posted in javascript | Leave a comment

javascript prototype

I would to extend the functionality of a JavaScript class, for example, to add utility functions to a specific Object.

Object.prototype

The prototype property can be used to add new properties and functions to any object, including built-in objects. For example, you could extend an Array as follows:

//
// extend Array objects, test if value is in array
// e.g.,  [0,1,2].contains(2) == true
//        [0,1,2].contains('spam') == false
Array.prototype.contains = function(obj) {
  for (var i = 0; i < this.length; i++) {
    if (this[i] == obj) {
      return true;
    }
  }
  return false;
}

This will add the contains() function to all Array objects, even those already instantiated. The prototype property can be used to dynamically add new properties to all objects of a specific type, for example:

function Spam(name, eggs) {
  this.name = name;
  this.eggs = eggs;
}
function Foo(eggs) {
  this.eggs = eggs;
}

myspam = new Spam('Brian', 12);
myfoo = new Foo(36);

function eggsByDozen() {
  return this.eggs / 12;
}
Spam.prototype.dozens = eggsByDozen;
Foo.prototype.dozens = eggsByDozen;

myspam.dozens() // returns 1
myfoo.dozens() // returns 3

In this case, an existing function eggsByDozen() is applied to multiple classes, new objects and even previously instantiated objects of those types will have the new method.

Posted in javascript | Leave a comment

sql, users not in group

I would like to find all users not in a specific group, given the following database schema:

CREATE TABLE foobar_users (
    user_id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    username VARCHAR(20) UNIQUE NOT NULL
);

CREATE TABLE foobar_groups (
    group_id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    groupname VARCHAR(20) UNIQUE NOT NULL
);

CREATE TABLE foobar_users_groups (
    user_id INT UNSIGNED NOT NULL,
    group_id INT UNSIGNED NOT NULL,
    PRIMARY KEY(user_id, group_id)
);

There are two common approaches to this problem, one is to use an outer-join and the other approach is to use a sub-select. Here is a left outer join example:

SELECT u.user_id, u.username
FROM foobar_users u
LEFT JOIN (foobar_users_groups ug, foobar_groups g)
  ON u.user_id = ug.user_id
  AND g.group_id = ug.group_id
  AND g.groupname = 'DMV'
WHERE
  ug.user_id IS NULL

Alternatively, you can use a sub-select, e.g.,

SELECT u.user_id, u.username
FROM foobar_users u
WHERE u.user_id NOT IN (
  SELECT ug.user_id
  FROM foobar_groups g, foobar_users_groups ug
  WHERE ug.group_id = g.group_id
    AND g.groupname = 'DMV'
)

My personal preference, and arguably more efficient solution than the above two, is to use a sub-select with NOT EXISTS, as follows:

SELECT u.user_id, u.username
FROM foobar_users u
WHERE NOT EXISTS (
  SELECT NULL
  FROM foobar_groups g, foobar_users_groups ug
  WHERE ug.group_id = g.group_id
    AND ug.user_id = u.user_id
    AND g.groupname = 'DMV'
)
Posted in mysql | Leave a comment

css opacity in javascript

I want register JavaScript events to toggle CSS opacity on selectable images.

For example, given a div with a list of images like the following,

<div id="foo_images">
   <img src="foo.jpg" />
   <img src="bar.jpg" />
   <img src="baz.jpg" />
   ...
</div>

I would like these images to be 50% transparent but 80% visible during a mouseover. I would also like these images to be selectable, i.e., an image should become fully visible after clicking it.

I would like to do this in JavaScript without having to modify the html image list.

We could add transparency in the stylesheet, as well as the mouseover effect, e.g.,

#foo_images img {
   filter: alpha(opacity=50);
   opacity: 0.5;
}
#foo_images img:hover {
   filter: alpha(opacity=80);
   opacity: 0.8;
}

In practice, it is highly recommended to keep all presentation elements in the stylesheet, but for the sake of example we can adjust an objects transparency using JavaScript, e.g.,

  //set obj transparency to 50%
  obj.style.opacity = 0.5; //non-IE browsers
  obj.style.filter = 'alpha(opacity=50)'; //IE

Using this approach, we can register events with JavaScript so that the transparency will change as the cursor hovers over, e.g.,

obj.onmouseover = function() {
  obj.style.opacity = 0.8;
  obj.style.filter = 'alpha(opacity=80)';
}
obj.onmouseout = function() {
  obj.style.opacity = 0.5;
  obj.style.filter = 'alpha(opacity=50)';
}

This would obviously be a lot of repeated code so we can use JavaScript closures, e.g.,

function opacityC(obj, value) {return function() {
  obj.style.opacity = value/100;
  obj.style.filter = 'alpha(opacity=' + value + ')';
}}
obj.onmouseover = opacityC(obj,80);
obj.onmouseout = opacityC(obj,50);

Next, to make the images selectable we can assign onclick events. We'll need to toggle between selected and not-selected, so we'll dynamically add a selected attribute to the image object. We can use the same closure approach to assign a function to the onclick event, e.g.,

function toggleSelectC(obj) {return function() {
  if (obj.selected != 'selected') {
    opacityC(obj, 100)();
    obj.onmouseout = opacityC(obj, 100);
    obj.style.border = '1px solid black';
    obj.selected = 'selected';
  } else {
    obj.onmouseout = opacityC(obj, 50);
    obj.style.border = '1px solid white';
    obj.selected = '';
  }
}}

image1.onclick = toggleSelectC(image1);
image2.onclick = toggleSelectC(image2);

Finally, we will loop through all of the images in the div, set the initial transparency and add event handlers, e.g.,

  var obj = document.getElementById('foo_images');
  var images = obj.getElementsByTagName("img");
  for (var i = 0; i < images.length; i++) {
    var img = images[i];
    opacityC(img,50)();
    img.onmouseover = opacityC(img, 80);
    img.onmouseout = opacityC(img, 50);
    img.onclick = toggleSelectC(img);
  }

Putting this all together, you would have the following HTML and JavaScript:

<div id="foo_images">
   <img src="foo.jpg" />
   <img src="bar.jpg" />
   <img src="baz.jpg" />
</div>
<script type="text/javascript" defer><!--

/**
 * closure to adjust an objects transparency
 */
function opacityC(obj, value) {return function() {
  obj.style.opacity = value/100;
  obj.style.filter = 'alpha(opacity=' + value + ')';
}}

/**
 * closure to toggle selected/non-selected style
 */
function toggleSelectC(obj) {return function() {
  if (obj.selected != 'selected') {
    opacityC(obj, 100)();
    obj.onmouseout = opacityC(obj, 100);
    obj.style.border = '1px solid black';
    obj.selected = 'selected';
  } else {
    opacityC(obj, 50)();
    obj.onmouseout = opacityC(obj, 50);
    obj.style.border = '1px solid white';
    obj.selected = '';
  }
}}

/**
 * register all selectable images
 */
function registerSelectableImages(obj) {
  var images = obj.getElementsByTagName("img");
  for (var i = 0; i < images.length; i++) {
    var img = images[i];
    opacityC(img,50)();
    img.style.border = '1px solid white';
    img.onmouseover = opacityC(img, 80);
    img.onmouseout = opacityC(img, 50);
    img.onclick = toggleSelectC(img);
  }
}
registerSelectableImages(document.getElementById('foo_images'));
--></script>

Applying the above code to the three sample images will produce the following results:


Posted in css, html, javascript, software architecture | Leave a comment

html footer at the bottom

I want the footer of an html page to never be higher than the bottom of the browser window. In other words, if there's not enough content to fill a webpage, I want the footer to be at the bottom of the page (rather than hovering underneath the content).

See this example.

Assuming a standard html div layout, e.g.,

<div id="page">

  <div id="header">...</div>
  <div id="body">...</div>
  <div id="footer">...</div>

</div>

There is cross-browser approach to keep the footer at the bottom of the page. I have seen various approaches to this issue and all of them rely on a similar trick: First, enforce a min-height, and second, use margin/padding to position a fixed-height footer.

The cross-browser minimum height trick looks like this:

html, body {
    height: 100%;
}
#page {
    min-height: 100%;
    height: auto !important;
    height: 100%;
}

Non-IE (and newer IE) browsers will understand the min-height style, older versions require the height: auto !important; height: 100%; styles.

Next, we position a fixed-height footer. You can do this through a negative-margin that overlaps the body, or use position: absolute;. I prefer the latter approach as it looks cleaner. Either way, remember to pad the bottom of your body content so that the footer will never overlap actual content, e.g.,

#body {
    padding-bottom: 3em;
}
#footer {
    position: absolute;
    bottom: 0;
    height: 3em;
}

The footer height must be fixed, and the padding-bottom should be greater than or equal to the fixed footer height.

Putting it all together, please see this modified example which uses the following sticky footer css:

/* sticky footer */
html, body {
    height: 100%;
}
#page {
    min-height: 100%;
    height: auto !important;
    height: 100%;
    position: relative;
}
#body {
    padding-bottom: 3em;
}
#footer {
    position: absolute;
    bottom: 0;
    height: 3em;
}
Posted in css, html | Leave a comment

database indexes and optimization

Previously, I discussed database schema normalization. Next, I would like to add indexes and optimize a schema for high-volume production usage.

Consider the following schema:

CREATE TABLE foobar_users (
    user_id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    username VARCHAR(20) NOT NULL
);

CREATE TABLE foobar_groups (
    group_id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    groupname VARCHAR(20) NOT NULL
);

CREATE TABLE foobar_users_groups (
    user_id INT UNSIGNED NOT NULL,
    group_id INT UNSIGNED NOT NULL,
    PRIMARY KEY(user_id, group_id)
);

Each table shown is in Sixth Normal Form (6NF) and contain no indexes other than the primary key. The schema represents a typical user/group mapping where there is a many-to-many relationship between users and groups (i.e., a user can belong to multiple groups, and a group contains multiple users).

I've randomly generated and loaded data into the tables. There is roughly twice as many entries in the mapping table as the domain tables:

mysql> SELECT COUNT(*) FROM foobar_users;
+----------+
| COUNT(*) |
+----------+
|   121005 |
+----------+
1 row in set (0.02 sec)

mysql> SELECT COUNT(*) FROM foobar_users_groups;
+----------+
| COUNT(*) |
+----------+
|   240999 |
+----------+
1 row in set (0.00 sec)

Common queries will be noticeably slower with this amount of data. For example, getting a list of groups for a specific user can take over one second. Using the EXPLAIN command we can find out why, e.g.,

mysql> EXPLAIN -- fetch groups with 'bbob' as member
    -> SELECT g.group_id, g.groupname
    -> FROM foobar_users u, foobar_groups g, foobar_users_groups ug
    -> WHERE u.user_id = ug.user_id
    ->   AND g.group_id = ug.group_id
    ->   AND u.username = 'bbob';
+----+-------------+-------+--------+---------------+---------+---------+--------+-------------+
| id | select_type | table | type   | possible_keys | key     | key_len | rows   | Extra       |
+----+-------------+-------+--------+---------------+---------+---------+--------+-------------+
|  1 | SIMPLE      | ug    | index  | PRIMARY       | PRIMARY | 8       | 240999 | Using index |
|  1 | SIMPLE      | g     | eq_ref | PRIMARY       | PRIMARY | 4       |      1 |             |
|  1 | SIMPLE      | u     | eq_ref | PRIMARY       | PRIMARY | 4       |      1 | Using where |
+----+-------------+-------+--------+---------------+---------+---------+--------+-------------+

The culprit is the full table scan on foobar_users_groups, which is perhaps the worst thing we can do since it's the largest table with almost a quarter million rows. In the EXPLAIN output, if you look in the column identified as rows, you'll see that 240999 rows were scanned.

A natural join version of this same query performs just as poorly:

mysql> EXPLAIN -- natural join version
    -> SELECT group_id, groupname
    -> FROM foobar_users
    -> NATURAL JOIN foobar_users_groups
    -> NATURAL JOIN foobar_groups
    -> WHERE username = 'bbob';
+----+-------------+---------------------+--------+---------------+---------+---------+--------+-------------+
| id | select_type | table               | type   | possible_keys | key     | key_len | rows   | Extra       |
+----+-------------+---------------------+--------+---------------+---------+---------+--------+-------------+
|  1 | SIMPLE      | foobar_users_groups | index  | PRIMARY       | PRIMARY | 8       | 240999 | Using index |
|  1 | SIMPLE      | foobar_groups       | eq_ref | PRIMARY       | PRIMARY | 4       |      1 |             |
|  1 | SIMPLE      | foobar_users        | eq_ref | PRIMARY       | PRIMARY | 4       |      1 | Using where |
+----+-------------+---------------------+--------+---------------+---------+---------+--------+-------------+

Selecting which users belong to a specific group performs the same full table scan of foobar_users_groups. If we were to load millions of users and groups the performance would be unacceptable by any reasonable expectation.

mysql> EXPLAIN -- users in the MVD group
    -> SELECT u.user_id, u.username
    -> FROM foobar_users u, foobar_groups g, foobar_users_groups ug
    -> WHERE u.user_id = ug.user_id
    ->   AND g.group_id = ug.group_id
    ->   AND g.groupname = 'MVD';
+----+-------------+-------+--------+---------------+---------+---------+--------+-------------+
| id | select_type | table | type   | possible_keys | key     | key_len | rows   | Extra       |
+----+-------------+-------+--------+---------------+---------+---------+--------+-------------+
|  1 | SIMPLE      | ug    | index  | PRIMARY       | PRIMARY | 8       | 240999 | Using index |
|  1 | SIMPLE      | g     | eq_ref | PRIMARY       | PRIMARY | 4       |      1 | Using where |
|  1 | SIMPLE      | u     | eq_ref | PRIMARY       | PRIMARY | 4       |      1 |             |
+----+-------------+-------+--------+---------------+---------+---------+--------+-------------+

Indexes

I want to add indexes to these tables to prevent full table scans. In fact, one of the main problems is that we're starting with a username and groupname and neither of those are indexed, only the user_id and group_id have indexes (from the primary key). Since these tables are fully normalized, there is no reason that username and groupname were not given the UNIQUE identifier in the create table statements, e.g.,

CREATE TABLE foobar_users (
    user_id INT AUTO_INCREMENT PRIMARY KEY,
    username VARCHAR(20) NOT NULL UNIQUE
);

This minor change would not only have enforced a data integrity constraint but also would have created an index on username that would have alleviated the query performance issues.

Rather than re-create the table, we can add indexes to existing tables, as follows

mysql> CREATE UNIQUE INDEX username ON foobar_users(username);
Query OK, 121005 rows affected (1.72 sec)
Records: 121005  Duplicates: 0  Warnings: 0

mysql> CREATE UNIQUE INDEX groupname ON foobar_groups(groupname);
Query OK, 121005 rows affected (1.83 sec)
Records: 121005  Duplicates: 0  Warnings: 0

With these indexes in place, the query performance is noticeably improved, and the EXPLAIN command will show why:

mysql> EXPLAIN -- fetch groups with 'bbob' as member
    -> SELECT g.group_id, g.groupname
    -> FROM foobar_users u, foobar_groups g, foobar_users_groups ug
    -> WHERE u.user_id = ug.user_id
    ->   AND g.group_id = ug.group_id
    ->   AND u.username = 'bbob';
+----+-------------+-------+--------+------------------+----------+---------+------+-------------+
| id | select_type | table | type   | possible_keys    | key      | key_len | rows | Extra       |
+----+-------------+-------+--------+------------------+----------+---------+------+-------------+
|  1 | SIMPLE      | u     | const  | PRIMARY,username | username | 62      |    1 |             |
|  1 | SIMPLE      | ug    | ref    | PRIMARY          | PRIMARY  | 4       |   12 | Using index |
|  1 | SIMPLE      | g     | eq_ref | PRIMARY          | PRIMARY  | 4       |    1 |             |
+----+-------------+-------+--------+------------------+----------+---------+------+-------------+

The query is no longer scanning entire tables, in fact, the largest estimated row count is from the foobar_users_groups mapping table-- EXPLAIN estimates it needs to scan 12 rows, which corresponds correctly with the number of groups that will be returned.

However, querying users in a specific group remains slow, consider the following:

mysql> EXPLAIN -- users in the MVD group
    -> SELECT u.user_id, u.username
    -> FROM foobar_users u, foobar_groups g, foobar_users_groups ug
    -> WHERE u.user_id = ug.user_id
    ->   AND g.group_id = ug.group_id
    ->   AND g.groupname = 'MVD';
+----+-------------+-------+--------+-------------------+-----------+---------+--------+-------------+
| id | select_type | table | type   | possible_keys     | key       | key_len | rows   | Extra       |
+----+-------------+-------+--------+-------------------+-----------+---------+--------+-------------+
|  1 | SIMPLE      | g     | const  | PRIMARY,groupname | groupname | 62      |      1 |             |
|  1 | SIMPLE      | u     | ALL    | PRIMARY           | NULL      | NULL    | 121005 |             |
|  1 | SIMPLE      | ug    | eq_ref | PRIMARY           | PRIMARY   | 8       |      1 | Using index |
+----+-------------+-------+--------+-------------------+-----------+---------+--------+-------------+

The query is no longer doing a full table scan of foobar_users_groups, but it is doing a full table scan of the foobar_users table. The problem is, the primary key on foobar_users_groups is a composite key consisting of (user_id, group_id). The query (as written) is scanning all of the user_id's that might be part of that composite (user_id, group_id).

Adding a separate group_id index on the foobar_users_groups table will solve this, i.e.,

mysql> CREATE INDEX group_id ON foobar_users_groups(group_id);
Query OK, 240999 rows affected (1.03 sec)
Records: 240999  Duplicates: 0  Warnings: 0

mysql> EXPLAIN -- users in the MVD group
    -> SELECT u.user_id, u.username
    -> FROM foobar_users u, foobar_groups g, foobar_users_groups ug
    -> WHERE u.user_id = ug.user_id
    ->   AND g.group_id = ug.group_id
    ->   AND g.groupname = 'MVD';
+----+-------------+-------+--------+-------------------+-----------+---------+------+-------+
| id | select_type | table | type   | possible_keys     | key       | key_len | rows | Extra |
+----+-------------+-------+--------+-------------------+-----------+---------+------+-------+
|  1 | SIMPLE      | g     | const  | PRIMARY,groupname | groupname | 62      |    1 |       |
|  1 | SIMPLE      | ug    | ref    | PRIMARY,group_id  | group_id  | 4       |    8 |       |
|  1 | SIMPLE      | u     | eq_ref | PRIMARY           | PRIMARY   | 4       |    1 |       |
+----+-------------+-------+--------+-------------------+-----------+---------+------+-------+
Posted in data architecture, mysql | Leave a comment