Python (2) collections

python built-in data type: collections

Definition

namedtuple

factory function for creating tuple subclasses with named fields

deque

list-like container with fast appends and pops on either end

Counter

dict subclass for counting hashable objects

defaultdict

dict subclass that calls a factory function to supply missing values

OrderedDict

dict subclass that remembers the order entries were added

Source code link
Abstract Base Classes (ABCs): _abcoll.py
Collections: collections.py

Counter
(Note: some built operation for counter will skip the negative value. (ex: |, &, +, - ))

>>> from collections import Counter

>>> cnt = Counter()
>>> for word in ['red', 'blue', 'red', 'green', 'blue', 'blue']:
...     cnt[word] += 1
>>> cnt
Counter({'blue': 3, 'red': 2, 'green': 1})

# n most common elements
>>> cnt.most_common(1)
[('blue', 3)]

# n least common elements
>>> n = 1
>>> c.most_common()[:-n-1:-1]      
[('green', 1)] 

# the count of non-exist entry is 0
>>> cnt["orange"]
0

# removes the entry
>>> del cnt['blue']
>>> cnt
Counter({'green': 1, 'red': 3})

# a counter for a string                 
>>> Counter('gallahad')
Counter({'a': 3, 'd': 1, 'g': 1, 'h': 1, 'l': 2})

# a counter for a list
Counter(['red', 'blue', 'red', 'green', 'blue', 'blue', 'red', 'red'])
Counter({'blue': 3, 'red': 2, 'green': 1})

# a new counter from a dictionary
>>> Counter(words)
Counter({'blue': 2, 'red': 4})

Elements are subtracted from another counter.
>>> c = Counter(a=4, b=2, c=0, d=-2, e=10)
>>> d = Counter(a=1, b=2, c=3, d=4, f=-4)
>>> c.subtract(d)
>>> c
Counter({'a': 3, 'b': 0, 'c': -3, 'd': -6, 'e': 10, 'f': 4})

difference between method subtract and __sub__

def subtract(*args, **kwds):
    if not args:
        raise TypeError("descriptor 'subtract' of 'Counter' object "
                        "needs an argument")
    self = args[0]
    args = args[1:]
    if len(args) > 1:
        raise TypeError('expected at most 1 arguments, got %d' % len(args))
    iterable = args[0] if args else None
    if iterable is not None:
        self_get = self.get
        if isinstance(iterable, Mapping):
            for elem, count in iterable.items():
                self[elem] = self_get(elem, 0) - count
        else:
            for elem in iterable:
                self[elem] = self_get(elem, 0) - 1
    if kwds:
        self.subtract(kwds)

def __sub__(self, other):
    if not isinstance(other, Counter):
        return NotImplemented
    result = Counter()
    for elem, count in self.items():
        newcount = count - other[elem]
        if newcount > 0:
            result[elem] = newcount
    for elem, count in other.items():
        if elem not in self and count < 0:
            result[elem] = 0 - count
    return result

deque
difference between deque and list:
deques are a generalization of stacks and queues. Deques support thread-safe, memory efficient appends and pops from either side of the deque with approximately the same O(1) performance in either direction.
Though list objects support similar operations, they are optimized for fast fixed-length operations and incur O(n) memory movement costs for pop(0) and insert(0, v) operations which change both the size and position of the underlying data representation.

>>> from collections import deque
>>> d = deque('ghi')                 # make a new deque with three items
>>> for elem in d:                   # iterate over the deque's elements
....    print elem.upper()
G
H
I

>>> d
deque(['g', 'h', 'i'])

>>> d.append('j')                    # add a new entry to the right side
>>> d.appendleft('f')                # add a new entry to the left side
>>> d                                # show the representation of the deque
deque(['f', 'g', 'h', 'i', 'j'])

>>> d.pop()                          # return and remove the rightmost item
'j'
>>> d.popleft()                      # return and remove the leftmost item
'f'

>>> 'h' in d                         # search the deque
True

>>> d.extend('jkl')                  # add iterables
>>> d
deque(['g', 'h', 'i', 'j', 'k', 'l'])
>>> d.rotate(1)                      # right rotation
>>> d
deque(['l', 'g', 'h', 'i', 'j', 'k'])
>>> d.rotate(-1)                     # left rotation
>>> d
deque(['g', 'h', 'i', 'j', 'k', 'l'])

>>> deque(reversed(d))               # make a new deque in reverse order
deque(['l', 'k', 'j', 'i', 'h', 'g'])
>>> d.clear()                        # empty the deque
>>> d.pop()                          # popping from an empty deque will cause IndexError
IndexError: pop from an empty deque

>>> d.extendleft('abc')              # extendleft() iterate the input from first element
>>> d
deque(['c', 'b', 'a'])

Assign the size of deque

>>> deque('abcdefghi')
deque(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'])

# Return the last 5 entries of an iterable'
>>> d = deque('abcdefghi', 5)
>>> d
deque(['e', 'f', 'g', 'h', 'i'])

>>> d.rotate(2)
deque(['h', 'i', 'e', 'f', 'g'])

>>> d.append('j')
deque(['i', 'e', 'f', 'g', 'j'])

>>> d.appendleft('k')
deque(['k', 'i', 'e', 'f', 'g'])

>>> d.pop()
'i'

>>> d
deque(['d', 'e', 'f', 'g'])

defaultdict
The first argument provides the initial value for the default_factory attribute; it defaults to None. All remaining arguments are treated the same as if they were passed to the dict constructor, including keyword arguments.
The first argument must be callable data type, not iterable. It provides a convinient way to group a sequence of instances based on customized data types.

# other abstract data collection
>>> from collections import defaultdict
>>> from collections import Counter
>>> s = [('yellow', ("a", 1)), ('blue', ("b", 2)), ('blue', ("b", 3)), ('blue', ("b", 10))]
>>> d = defaultdict(Counter)
>>> for k, v in s:
...     d[k][v]+=1
>>> d.items()
[('blue', Counter({('b', 2): 1, ('b', 3): 1, ('b', 10): 1})), ('yellow', Counter({('a', 1): 1}))]

# integer
>>> s = 'mississippi'
>>> d = defaultdict(int)
>>> for k in s:
...     d[k] += 1
>>> d.items()
[('i', 4), ('p', 2), ('s', 4), ('m', 1)]

# customized function
>>> def constant_factory(value):
...     return itertools.repeat(value).next
>>> d = defaultdict(constant_factory('<missing>'))
>>> d.update(name='John', action='ran')
>>> '%(name)s %(action)s to %(object)s' % d
'John ran to <missing>'

namedtuple
Namedtuples assign meaning to each position in a tuple and allow for more readable, self-documenting code. namedtuple instances are just as memory efficient as regular tuples because they do not have per-instance dictionaries. Each kind of namedtuple is represented by its own class, created by using the namedtuple() factory function. The arguments are the name of the new class and a string containing the names of the elements.

difference between namedtuple and regular tuple:
The standard tuple uses numerical indexes to access its members. Named tuples are especially useful for assigning field names to result tuples from file-parsing modules.

>>> from collections import namedtuple
>>> Point = namedtuple('Point', ['x', 'y'], verbose=False)
>>> p = Point(11, y=22)     # instantiate with positional or keyword arguments
>>> p[0] + p[1]             # indexable like the plain tuple (11, 22)
33
>>> x, y = p                # unpack like a regular tuple
>>> x, y
(11, 22)
>>> p.x + p.y               # fields also accessible by name
33
>>> p                       # readable __repr__ with a name=value style
Point(x=11, y=22)

>>> p = Point(x=11, y=22)   # _replace returns a new instance of the namedtuple replacing specified fields with new values
>>> p._replace(x=33)
Point(x=33, y=22)

_fields lists the field names. It is Useful for field inspection and for creating new named tuple types from existing namedtuples. Subclass is not useful for adding new, stored fields, so simply create a new named tuple type from the _fields attribute

>>> p._fields            # view the field names
('x', 'y')

>>> Color = namedtuple('Color', 'red green blue')
>>> Point._fields + Color._fields
('x', 'y', 'red', 'green', 'blue')
>>> Pixel = namedtuple('Pixel', Point._fields + Color._fields)
>>> Pixel(11, 22, 128, 255, 0)
Pixel(x=11, y=22, red=128, green=255, blue=0)

Build namedtuple object from csv file. _make is the buil-in method in class generation stage and makes a new instance from an existing sequence or iterable..

>>> EmployeeRecord = namedtuple('EmployeeRecord', 'name, age, title, department, paygrade')

>>> import csv
>>> for emp in map(EmployeeRecord._make, csv.reader(open("employees.csv", "rb"))):
...     print emp.name, emp.title

Any valid Python identifier may be used for a fieldname except for field name repeatition, names starting with an underscore, or conflict with Python keywords. In situations where a namedtuple is being created based on values outside of the control of the programm (such as to represent the rows returned by a database query, where the schema is not known in advance), set rename = True so the fields are renamed.

>>> with_class = namedtuple('Person', 'name class age gender', rename=True)
>>> with_class._fields
('name', '_1', 'age', 'gender')

>>> two_ages = namedtuple('Person', 'name age gender age', rename=True)
>>> two_ages._fields
('name', 'age', 'gender', '_3')

>>> with_underscore = namedtuple('Person', 'name,weight,_age,gender', rename=True)
>>> print with_underscore._fields
('name', 'weight', '_2', 'gender')

Retrieve a field value from namedtuple

>>> getattr(p, 'x')
11

To convert a dictionary to a named tuple, use the double-star-operator.

>>> d = {'x': 11, 'y': 22}
>>> Point(**d)
Point(x=11, y=22)

Method could be added or changed with a subclass. Here is how to add a calculated field and a fixed-width print format:

>>> class Point(namedtuple('Point', 'x y')):
...     __slots__ = ()
...     @property
...     def hypot(self):
...         return (self.x ** 2 + self.y ** 2) ** 0.5
...     def __str__(self):
...         return 'Point: x=%6.3f  y=%6.3f  hypot=%6.3f' % (self.x, self.y, self.hypot)
...
>>> for p in Point(3, 4), Point(14, 5/7.):
...     print p
Point: x= 3.000  y= 4.000  hypot= 5.000
Point: x=14.000  y= 0.714  hypot=14.018

OrderdDict

Ordered dictionaries are just like regular dictionaries but they remember the order that items were inserted. When iterating over an ordered dictionary, the items are returned in the order their keys were first added.

>>> # regular unsorted dictionary
>>> d = {'banana': 3, 'apple': 4, 'pear': 1, 'orange': 2}

>>> # dictionary sorted by key
>>> OrderedDict(sorted(d.items(), key=lambda t: t[0]))
OrderedDict([('apple', 4), ('banana', 3), ('orange', 2), ('pear', 1)])

>>> # dictionary sorted by value
>>> OrderedDict(sorted(d.items(), key=lambda t: t[1]))
OrderedDict([('pear', 1), ('orange', 2), ('banana', 3), ('apple', 4)])