1 """
2 Read, query and update textual tables via flexible SQL interface.
3
4 L{Table}s can be created and populated with data from scratch, built from TSV
5 files, 2D lists or other tables. Once the data is loaded in memory, each
6 storage operation on the table object is delegated via bridge to an SQL
7 storage backend (by default this is SQLite). However the table uses the backend
8 only as a temp storage to ensure maximum portability of the data. Tables can be
9 stored persistently as text (TSV) files and then loaded back in memory when
10 needed.
11
12 These Tables can be queried and updated in a vast number of ways; each query
13 returns a new L{Table}:
14
15 1. Using slice expressions. The general form of a slice expression is
16 C{[rows, columns]}, where C{rows} can be:
17
18
19 - a row index, 0-based, e.g. C{5}
20 - a tuple of row indices, e.g. C{(1, 3, 6)}
21 - a standard Python slice, e.g. C{1:3} or C{:5} or C{:}
22 - omitted (means: all rows)
23
24 and C{columns} can be:
25
26 - a column index, 0-based, e.g. C{5}
27 - a tuple of columns indices, 0-based
28 - a column name, e.g. C{'TmScore'}
29 - a tuple of column names, e.g. C{('ID', 'TmScore')}
30 - a standard Python slice using column indices
31 - a slice using column names, e.g. C{'ID':'TM'} or C{:'TM'} or C{:}
32 - omitted (means: all columns)
33
34 2. Using query expressions, for example:
35
36
37 >>> table.where('ID').between(1, 5).select('TmScore', 'RMSD')
38 Table ('TmScore', 'RMSD')
39
40 >>> table.where('ID').between(1, 5).update('RMSD', 0.2)
41 Table (the same table)
42
43 3. With SQL queries:
44
45
46 >>> t.query(r'''SELECT ColumnB * ColumnA AS ComputedValue
47 FROM {0.name}
48 WHERE ColumnC IN ({1}, {1})'''.format(t, Predicate.PH),
49 [12, 55])
50 iterable
51
52 The data contained in a Table can be extracted in several ways:
53
54 - if you need a single (scalar) value -- with the C{table[row, column]}
55 indexing expression or with the dedicated C{table.scalar(row, column)} method.
56 - by treating the table as an iterator; each cycle will then yield a L{DataRow}
57 object
58 - with text (TSV) serialization: simply call C{table.dump(file)}.
59
60 See L{Table} for full API details.
61 """
62
63 try:
64 import __builtin__ as builtins
65 except ImportError:
66 import builtins
67
68 import os
69
70 import csb.io
71 import csb.core
72
73 from abc import ABCMeta, abstractmethod, abstractproperty
77 """
78 Abstract SQL backend interface. Defines a number of platform-specific
79 operations, that each concrete backend implementor must provide.
80 """
81
82 __metaclass__ = ABCMeta
83
85 self._table = tablename
86
87 @abstractproperty
90
91 @property
93 """
94 Table name
95 @rtype: str
96 """
97 return self._table
98
99 - def query(self, sql, params=None):
100 """
101 Execute a native SQL query against the backend, as-is.
102
103 @param sql: SQL query
104 @type sql: str
105 @param params: query bound parameters, if any
106 @type params: tuple
107
108 @return: data reader (2D iterable)
109 """
110 raise NotImplementedError()
111
112 @abstractmethod
114 """
115 Count the number of rows in the table.
116
117 @rtype: int
118 """
119 pass
120
121 @abstractmethod
123 """
124 Perform a select operation given L{expression}.
125
126 @type expression: L{Expression}
127 @return: data reader (2D iterable)
128 """
129 pass
130
131 @abstractmethod
132 - def update(self, expression):
133 """
134 Perform an update operation given L{expression}.
135
136 @type expression: L{Expression}
137 @return: void
138 """
139 pass
140
141 @abstractmethod
143 """
144 Insert a new tuple in the table.
145
146 @type row: tuple
147 @return: void
148 """
149 pass
150
151 @abstractmethod
153 """
154 Create a table given L{metadata}.
155
156 @type metadata: tuple of L{ColumnInfo}
157 @return: void
158 """
159 pass
160
161 @abstractmethod
163 """
164 Perform cleanup (e.g. close connections).
165 """
166 pass
167
170
173
175 """
176 SQLite-based concrete repository implementor.
177 This is the default L{Table} backend.
178 """
179
180 PK = 'ROWID'
181 TYPES = { int: 'BIGINT', float: 'REAL', str: 'VARCHAR' }
182
184
185 SIZE = 10000
186
188 self._cursor = cursor
189
191 try:
192 while True:
193 rows = self._cursor.fetchmany(self.SIZE)
194 if not rows:
195 break
196 else:
197 for row in rows:
198 yield row
199 finally:
200 self._cursor.close()
201
209
210 @property
213
214 - def _cursor(self, sql, params=None):
221
222 - def query(self, sql, params=None):
223
224 return self._cursor(sql, params).fetchall()
225
227
228 query = 'SELECT COUNT(*)\nFROM {0}\n'.format(self.table)
229 return self._cursor(query).fetchone()[0]
230
243
256
263
265
266 cols = []
267
268 for ci in metadata:
269 type = self._gettype(ci.type)
270 cols.append('{0} {1}'.format(ci.name, type))
271
272 statement = 'CREATE TABLE {0} (\n {1}\n);'.format(self.table, ',\n '.join(cols))
273
274 self._conn.execute(statement)
275
281
283 try:
284 return self._conn.close()
285 except:
286 pass
287
289 """
290 Holder object for column metadata.
291
292 @param name: column name
293 @type name: str
294 @param type: column data type (Python)
295 @type type: type
296 """
297
301
302 @property
304 """
305 Colulmn name
306 @rtype: str
307 """
308 return self._name
309
310 @property
312 """
313 Python data type
314 @rtype: class
315 """
316 return self._type
317
319 return '{0.name}:{0.type.__name__}'.format(self)
320
322 """
323 @return: a deep copy of C{self}
324 """
325 return ColumnInfo(self.name, self.type)
326
328 """
329 Represents a table data row. This is basically what a table iterator
330 yields for each row in a table. Provides both index (position) and
331 column name-based access to the data.
332 """
333
334 - def __init__(self, columns, number, row):
335
336 self._number = number
337 self._row = tuple(row)
338 self._columns = {}
339 for i, c in enumerate(columns):
340 self._columns[c] = i
341 assert len(self._columns) == len(self._row)
342
344 return iter(self._row)
345
347
348 if isinstance(i, csb.core.string):
349 return self._row[self._columns[i]]
350 else:
351 return self._row[i]
352
354 return len(self._row)
355
357 return '{0}: {1}'.format(self.__class__.__name__, repr(self._row))
358
361
362 - def dump(self, delimiter='\t'):
363 """
364 Dump the row as a string.
365
366 @param delimiter: column separator (defaults to tab)
367 @type delimiter: str
368 """
369 return delimiter.join(map(Table._isnull, self._row))
370
371 @property
373 """
374 Available column names
375 @rtype: tuple
376 """
377 c = dict((self._columns[cn], cn) for cn in self._columns)
378 return tuple(c[i] for i in sorted(c))
379
380 @property
382 """
383 Row number
384 @rtype: int
385 """
386 return self._number
387
389 """
390 Build and query a TSV Table. See the documentation of L{csb.io.tsv} for
391 details and examples.
392
393 @param definition: column definition string: C{ColA:typeA colB:typeB ...},
394 where C{ColN} is a column name and C{typeN} is one of the
395 base Python data types: str, int, long, float.
396 Alternatively, the table definition may be specified
397 directly as a list of metadata objects.
398 @type definition: str, tuple of L{ColumnInfo}
399 @param name: name of the table on the SQL backend. Useful when you need to
400 execute native SQL queries against the table.
401 @type name: str
402 @param backend: table backend storage engine. This must be a proper
403 L{RepositoryImp} bridge implementor.
404 @type backend: type (reference to a L{RepositoryImp} subclass)
405
406 @raise UnsupportedTypeError: when an unsupported type is used in the table
407 C{definition}
408 @raise ValueError: if the C{definition} is not valid
409 """
410
411 """
412 Table header string, used when saving and restoring TSV files.
413 """
414 HEADER = '# @TSV '
415 NULL = ''
416
418
419 if not issubclass(backend, RepositoryImp):
420 raise TypeError('The Table Backend must be a Repository Implementor')
421
422 self._name = name
423 self._backend = backend
424 self._imp = backend(name)
425 self._metadata = []
426
427 try:
428 if isinstance(definition[0], ColumnInfo):
429 self._metadata = [ c.copy() for c in definition ]
430 else:
431 if isinstance(definition, csb.core.string):
432 definition = [ (d.split(':')[0], getattr(builtins, d.split(':')[1])) for d in definition.split() ]
433 self._metadata = [ ColumnInfo(c[0], c[1]) for c in definition ]
434 if len(self._metadata) < 1:
435 raise ValueError()
436 except UnsupportedTypeError:
437 raise
438 except (TypeError, IndexError, ValueError, NameError, AttributeError):
439 raise ValueError('Invalid table definition')
440
441 self._imp.create(self._metadata)
442
443 @staticmethod
446 """
447 Table factory: build a L{Table} from a TSV file.
448
449 @param tsv: TSV path and filename. This can be either a conventional
450 TSV/CSV file, or a file created with C{table.dump(tsv)}
451 @type tsv: str
452 @param definition: table column definition (see L{Table}). If defined,
453 this parameter will determine the structure of the
454 table. Otherwise, the table definition will be
455 extracted from the TSV header. If the file contains
456 no TSV header, this parameter is mandatory.
457 @type definition: str, tuple of L{ColumnInfo}
458 @param delimiter: column separator used in the file
459 @type delimiter: str
460 @param skip: skip the first N number of rows (the header can still be
461 extracted from those however)
462 @type skip: int
463
464 @rtype: L{Table}
465
466 @raise ValueError: if neither a table C{definition} is provided,
467 nor the C{tsv} file has a header line
468 """
469
470 if not definition:
471 with open(tsv) as tsvfile:
472 for line in tsvfile:
473 if line.startswith(Table.HEADER):
474 definition = line[ len(Table.HEADER) : ]
475
476 if not definition:
477 raise ValueError('No header definition found')
478
479 table = Table(definition, name=name, backend=backend)
480
481 with open(tsv) as tsvfile:
482 for i, line in enumerate(tsvfile, start=1):
483 if (skip and i <= skip) or line.startswith(Table.HEADER):
484 continue
485 data = line.rstrip(os.linesep).split(delimiter)
486 table.insert(data)
487
488 return table
489
490 @staticmethod
492 """
493 Table factory: build a L{Table} from a 2D iterable/data reader.
494
495 @param iterable: data container
496 @type iterable: iterable (2D)
497 @param definition: table column definition (see L{Table}).
498 @type definition: str, tuple of L{ColumnInfo}
499
500 @rtype: L{Table}
501 """
502 table = Table(definition, name=name, backend=backend)
503
504 for row in iterable:
505 table.insert(list(row))
506
507 return table
508
509 @staticmethod
511 """
512 Table factory: build a L{Table} with the definition of another L{Table}.
513
514 @param table: template table
515 @type table: L{Table}
516 @param data: if True, also copy the data from the source C{table}
517 @type data: bool
518
519 @rtype: L{Table}
520 """
521 if data:
522 return Table.from_iterable(table, table._metadata, name=name, backend=backend)
523 else:
524 return Table(table._metadata, name=name, backend=backend)
525
526 @staticmethod
532
550
553
555 return self._imp.count()
556
564
566 import numpy
567 return numpy.array([ tuple(row) for row in self ])
568
574
586
588
589 exp = self._interpret(i)
590
591 if len(exp.select) != 1:
592 raise NotImplementedError('single-column expression expected')
593 if csb.core.iterable(value):
594 raise NotImplementedError("single-value assignment expected")
595
596 exp.data = value
597 self._update(exp)
598
607
609 """
610 Parse a table slice and convert it into an L{Expression}.
611 @rtype: L{Expression}
612 """
613
614 if not csb.core.iterable(i):
615 i = [i, slice(None, None)]
616 else:
617 i = list(i)
618
619 if len(i) not in (1, 2):
620 raise ValueError('Tables are only 2 dimensional')
621 if len(i) == 1:
622 i.append(slice(None, None))
623
624 exp = Expression(self.columns)
625 columns = self._getcols(i[1])
626 if len(columns) < 1:
627 raise ValueError('Column slices must return at least one column')
628 exp.select = columns
629 exp.where = self.pk
630
631 if isinstance(i[0], int):
632 self._checkrow(i[0])
633 if len(columns) == 1 and isinstance(i[1], (int, csb.core.string)):
634 exp.scalar = True
635 exp.predicate = Equals(i[0] + 1)
636
637 elif csb.core.iterable(i[0]):
638 params = list(i[0])
639 self._checkrow(params)
640 params = list(map(lambda x: x + 1, params))
641 exp.predicate = In(params)
642
643 elif isinstance(i[0], slice):
644
645 sl = i[0]
646 if sl.step is not None:
647 raise NotImplementedError('Row slice steps are not supported')
648
649 if sl == slice(None, None):
650 exp.where = None
651 elif sl.start is None:
652 self._checkrow(sl.stop)
653 exp.predicate = Lower(sl.stop + 1)
654 elif sl.stop is None:
655 self._checkrow(sl.start)
656 exp.predicate = GreaterOrEquals(sl.start + 1)
657 else:
658 self._checkrow([sl.start, sl.stop])
659 exp.predicate = Between(sl.start + 1, sl.stop)
660
661 else:
662 raise TypeError("Can't handle row slice expression: {0}".format(i[0]))
663
664 return exp
665
667
668 if isinstance(i, int):
669 if i < 0:
670 raise NotImplementedError('Negative row indices are not supported')
671 elif csb.core.iterable(i):
672 for j in i:
673 self._checkrow(j)
674 else:
675 raise TypeError(i)
676
678
679 columns = list(self.columns)
680
681 if spec is None and ifnull is not None:
682 return [ifnull]
683
684 elif isinstance(spec, int):
685 try:
686 return [columns[spec]]
687 except:
688 raise IndexError('Column {0} out of range'.format(spec))
689
690 elif isinstance(spec, csb.core.string):
691 if spec in columns:
692 return [spec]
693 else:
694 raise InvalidColumnError(spec)
695
696 elif isinstance(spec, slice):
697 start = self._getcols(spec.start, columns[0])
698 start = columns.index(start[0])
699
700 end = self._getcols(spec.stop, columns[-1])
701 end = columns.index(end[0])
702 if spec.stop is None:
703 end += 1
704
705 return [columns[i] for i in range(start, end, spec.step or 1)]
706
707 elif csb.core.iterable(spec):
708 return [self._getcols(i)[0] for i in spec]
709
710 else:
711 raise TypeError("Can't handle column slice expression: {0}".format(spec))
712
713 @property
715 """
716 Table name
717 @rtype: str
718 """
719 return self._name
720
721 @property
723 """
724 All column names
725 @rtype: tuple
726 """
727 return tuple(i.name for i in self._metadata)
728
729 @property
732
733 - def dump(self, file):
734 """
735 Dump the table in a file.
736
737 @param file: destination stream or filename
738 @type file: file (stream) or str (filename)
739 """
740
741 with csb.io.EntryWriter(file, close=False) as out:
742
743 definition = map(str, self._metadata)
744 out.write(Table.HEADER)
745 out.writeall(definition, delimiter=' ')
746 out.write(csb.io.NEWLINE)
747
748 for row in self:
749 out.writeline(row.dump(delimiter='\t'))
750
751 - def query(self, sql, params=None):
752 """
753 Execute a native SQL query against the storage engine.
754
755 @param sql: SQL query text. May contain parameter binding placeholders
756 (see L{Predicate.PH}). The SQL dialect of the query depends
757 on the SQL C{backend} being used by the table.
758
759 @return: native data reader
760 @rtype: iterable (2D)
761 """
762 return self._imp.query(sql, params)
763
765 """
766 Insert a new row in the table.
767
768 @param row: a tuple of the appropriate length
769 @type row: tuple
770 """
771 data = self._convert(row)
772 self._imp.insert(data)
773
775
776 metadata = dict((c.name, c) for c in self._metadata)
777 try:
778 return [metadata[cn].copy() for cn in columns]
779 except KeyError as ke:
780 raise InvalidColumnError(ke.message)
781 except:
782 raise
783
789
797
798 - def where(self, column):
799 """
800 @param column: column name
801 @type column: str
802 @raise InvalidColumnError: when an invalid column is requested
803 """
804 exp = Expression(self.columns)
805 return Where(self, exp, column)
806
808 """
809 @return: a new L{Table}
810
811 @param columns: column names; defaults to all columns
812 @type columns: str, tuple of str
813 @raise InvalidColumnError: when an invalid column is requested
814 """
815 columns = Expression.array(columns)
816
817 exp = Expression(self.columns)
818 exp.select = columns
819
820 return self._execute(exp)
821
822 - def update(self, column, value):
823 """
824 Update C{column} for all rows in the table.
825
826 @param column: column to update (name)
827 @type column: str
828 @param value: new column value
829 @raise InvalidColumnError: when an invalid column is referenced
830 """
831 exp = Expression(self.columns)
832 exp.select = [column]
833 exp.data = value
834
835 return self._update(exp)
836
837 - def scalar(self, row=None, column=None):
838 """
839 @return: a scalar value at the specified row and column.
840
841 @param row: row index; if not specified - take the first row
842 @type row: int
843 @param column: column name; if not specified - take the first
844 @type column: str
845
846 @raise IndexError: when an invalid row is requested
847 @raise InvalidColumnError: when an invalid column is requested
848 """
849
850 if row is None:
851 row = 0
852 row += 1
853 if column is None:
854 column = self.columns[0]
855 elif column not in self.columns:
856 raise InvalidColumnError(column)
857
858 exp = Expression(self.columns)
859 exp.select = [column]
860 exp.where = self.pk
861 exp.predicate = Equals([row])
862
863 reader = list(self._imp.execute(exp))
864 if len(reader) > 0:
865 return reader[0][0]
866 else:
867 raise IndexError()
868
870 """
871 @return: a list all values in the specified column
872
873 @param column: column to fetch
874 @type column: str
875 """
876 if column not in self.columns:
877 raise InvalidColumnError(column)
878
879 return [ row[column] for row in self ]
880
882 """
883 Metadata container: represents a table select or update expression.
884 """
885
887
888 self._table = None
889 self._columns = []
890
891 self._columns = list(columns)
892 self._select = []
893 self._where = None
894 self._predicate = None
895 self._data = None
896 self._scalar = False
897
898 self.select = '*'
899
900 @staticmethod
905
906 @property
908 return tuple(self._columns)
909
910 @property
916
917 @property
920 @select.setter
935
936 @property
939 @where.setter
941 if not value:
942 self._where = None
943 self._predicate = None
944 else:
945 self._where = value
946
947 @property
949 return self._predicate
950 @predicate.setter
952 if not value:
953 self._where = None
954 self._predicate = None
955 else:
956 self._predicate = value
957
958 @property
961 @data.setter
962 - def data(self, value):
964
965 @property
968 @scalar.setter
971
973
978
979 @property
982
983 @property
985 return self._expression
986
988
989 - def __init__(self, table, expression, column):
996
997 - def in_(self, *values):
999
1000 - def notin(self, *values):
1002
1005
1008
1011
1014
1015 - def lower(self, value):
1017
1019
1020 - def __init__(self, table, expression, predicate):
1024
1026 """
1027 @return: a new L{Table}
1028
1029 @param columns: column names; defaults to all columns
1030 @type columns: str, tuple of str
1031 @raise InvalidColumnError: when an invalid column is requested
1032 """
1033 exp = self.expression
1034 exp.select = columns
1035
1036 return self.table._execute(exp)
1037
1038 - def update(self, column, value):
1039 """
1040 Update C{column} for all rows in the table.
1041
1042 @param column: column to update (name)
1043 @type column: str
1044 @param value: new column value
1045 @raise InvalidColumnError: when an invalid column is referenced
1046 """
1047 exp = self.expression
1048 exp.select = [column]
1049 exp.data = value
1050
1051 return self.table._update(exp)
1052
1054
1055 __metaclass__ = ABCMeta
1056
1057 PH = '?'
1058
1073
1074 @property
1076 return tuple(self._params)
1077
1079
1080 if len(self._params) < 1:
1081 raise ValueError('{0} predicate with no params'.format(self.__class__.__name__))
1082
1083 @abstractproperty
1086
1089
1090 -class In(Predicate):
1096
1103
1109
1115
1121
1127
1133
1139
1147