1  """ 
   2  Read, query and update textual tables via flexible SQL interface. 
   3   
   4  L{Table}s can be created and populated with data from scratch, built from TSV 
   5  files, 2D lists or other tables. Once the data is loaded in memory, each 
   6  storage operation on the table object is delegated via bridge to an SQL 
   7  storage backend (by default this is SQLite). However the table uses the backend 
   8  only as a temp storage to ensure maximum portability of the data. Tables can be 
   9  stored persistently as text (TSV) files and then loaded back in memory when 
  10  needed. 
  11   
  12  These Tables can be queried and updated in a vast number of ways; each query 
  13  returns a new L{Table}:   
  14   
  15      1. Using slice expressions. The general form of a slice expression is 
  16      C{[rows, columns]}, where C{rows} can be: 
  17       
  18       
  19          - a row index, 0-based, e.g. C{5} 
  20          - a tuple of row indices, e.g. C{(1, 3, 6)} 
  21          - a standard Python slice, e.g. C{1:3} or C{:5} or C{:} 
  22          - omitted (means: all rows) 
  23           
  24      and C{columns} can be: 
  25       
  26          - a column index, 0-based, e.g. C{5}     
  27          - a tuple of columns indices, 0-based 
  28          - a column name, e.g. C{'TmScore'}         
  29          - a tuple of column names, e.g. C{('ID', 'TmScore')} 
  30          - a standard Python slice using column indices 
  31          - a slice using column names, e.g. C{'ID':'TM'} or C{:'TM'} or C{:} 
  32          - omitted (means: all columns) 
  33       
  34      2. Using query expressions, for example: 
  35       
  36       
  37      >>> table.where('ID').between(1, 5).select('TmScore', 'RMSD') 
  38      Table ('TmScore', 'RMSD') 
  39       
  40      >>> table.where('ID').between(1, 5).update('RMSD', 0.2) 
  41      Table (the same table) 
  42           
  43      3. With SQL queries: 
  44       
  45       
  46      >>> t.query(r'''SELECT  ColumnB * ColumnA AS ComputedValue 
  47                      FROM    {0.name} 
  48                      WHERE   ColumnC IN ({1}, {1})'''.format(t, Predicate.PH), 
  49                  [12, 55]) 
  50      iterable 
  51       
  52  The data contained in a Table can be extracted in several ways: 
  53   
  54      - if you need a single (scalar) value -- with the C{table[row, column]} 
  55      indexing expression or with the dedicated C{table.scalar(row, column)} method. 
  56      - by treating the table as an iterator; each cycle will then yield a L{DataRow} 
  57      object 
  58      - with text (TSV) serialization: simply call C{table.dump(file)}. 
  59       
  60  See L{Table} for full API details. 
  61  """ 
  62   
  63  try: 
  64      import __builtin__ as builtins 
  65  except ImportError: 
  66      import builtins 
  67   
  68  import os 
  69   
  70  import csb.io 
  71  import csb.core 
  72   
  73  from abc import ABCMeta, abstractmethod, abstractproperty 
  77      """ 
  78      Abstract SQL backend interface. Defines a number of platform-specific 
  79      operations, that each concrete backend implementor must provide.  
  80      """ 
  81       
  82      __metaclass__ = ABCMeta 
  83   
  85          self._table = tablename 
   86       
  87      @abstractproperty 
  90       
  91      @property 
  93          """ 
  94          Table name 
  95          @rtype: str 
  96          """ 
  97          return self._table     
   98       
  99 -    def query(self, sql, params=None): 
  100          """ 
 101          Execute a native SQL query against the backend, as-is. 
 102           
 103          @param sql: SQL query 
 104          @type sql: str 
 105          @param params: query bound parameters, if any 
 106          @type params: tuple 
 107           
 108          @return: data reader (2D iterable) 
 109          """ 
 110          raise NotImplementedError() 
  111       
 112      @abstractmethod 
 114          """ 
 115          Count the number of rows in the table. 
 116           
 117          @rtype: int 
 118          """ 
 119          pass     
  120       
 121      @abstractmethod 
 123          """ 
 124          Perform a select operation given L{expression}. 
 125           
 126          @type expression: L{Expression}     
 127          @return: data reader (2D iterable)         
 128          """ 
 129          pass 
  130       
 131      @abstractmethod 
 132 -    def update(self, expression): 
  133          """ 
 134          Perform an update operation given L{expression}. 
 135           
 136          @type expression: L{Expression}     
 137          @return: void         
 138          """ 
 139          pass 
  140       
 141      @abstractmethod 
 143          """ 
 144          Insert a new tuple in the table. 
 145           
 146          @type row: tuple     
 147          @return: void         
 148          """ 
 149          pass     
  150       
 151      @abstractmethod 
 153          """ 
 154          Create a table given L{metadata}. 
 155           
 156          @type metadata: tuple of L{ColumnInfo}     
 157          @return: void         
 158          """         
 159          pass 
  160       
 161      @abstractmethod 
 163          """ 
 164          Perform cleanup (e.g. close connections). 
 165          """ 
 166          pass  
   167   
 170   
 173   
 175      """ 
 176      SQLite-based concrete repository implementor. 
 177      This is the default L{Table} backend.  
 178      """ 
 179   
 180      PK = 'ROWID' 
 181      TYPES = { int: 'BIGINT', float: 'REAL', str: 'VARCHAR' } 
 182       
 184           
 185          SIZE = 10000 
 186   
 188              self._cursor = cursor 
  189               
 191              try: 
 192                  while True: 
 193                      rows = self._cursor.fetchmany(self.SIZE) 
 194                      if not rows: 
 195                          break 
 196                      else: 
 197                          for row in rows: 
 198                              yield row 
 199              finally:            
 200                  self._cursor.close() 
   201           
 209           
 210      @property 
 213       
 214 -    def _cursor(self, sql, params=None): 
  221               
 222 -    def query(self, sql, params=None): 
  223           
 224          return self._cursor(sql, params).fetchall() 
  225       
 227           
 228          query = 'SELECT  COUNT(*)\nFROM    {0}\n'.format(self.table) 
 229          return self._cursor(query).fetchone()[0]    
  230           
 243       
 256       
 263           
 265           
 266          cols = [] 
 267           
 268          for ci in metadata: 
 269              type = self._gettype(ci.type) 
 270              cols.append('{0}  {1}'.format(ci.name, type)) 
 271               
 272          statement = 'CREATE TABLE {0} (\n    {1}\n);'.format(self.table, ',\n    '.join(cols)) 
 273           
 274          self._conn.execute(statement) 
  275       
 281       
 283          try: 
 284              return self._conn.close() 
 285          except: 
 286              pass 
   287   
 289      """ 
 290      Holder object for column metadata. 
 291       
 292      @param name: column name 
 293      @type name: str 
 294      @param type: column data type (Python) 
 295      @type type: type 
 296      """ 
 297       
 301       
 302      @property 
 304          """ 
 305          Colulmn name 
 306          @rtype: str 
 307          """ 
 308          return self._name 
  309       
 310      @property 
 312          """ 
 313          Python data type 
 314          @rtype: class 
 315          """         
 316          return self._type     
  317       
 319          return '{0.name}:{0.type.__name__}'.format(self) 
  320       
 322          """ 
 323          @return: a deep copy of C{self} 
 324          """ 
 325          return ColumnInfo(self.name, self.type) 
   326       
 328      """ 
 329      Represents a table data row. This is basically what a table iterator 
 330      yields for each row in a table. Provides both index (position) and 
 331      column name-based access to the data. 
 332      """ 
 333       
 334 -    def __init__(self, columns, number, row): 
  335               
 336          self._number = number 
 337          self._row = tuple(row) 
 338          self._columns = {} 
 339          for i, c in enumerate(columns): 
 340              self._columns[c] = i 
 341          assert len(self._columns) == len(self._row)             
  342   
 344          return iter(self._row) 
  345           
 347           
 348          if isinstance(i, csb.core.string):             
 349              return self._row[self._columns[i]] 
 350          else: 
 351              return self._row[i]             
  352           
 354          return len(self._row) 
  355       
 357          return '{0}: {1}'.format(self.__class__.__name__, repr(self._row)) 
  358       
 361       
 362 -    def dump(self, delimiter='\t'): 
  363          """ 
 364          Dump the row as a string. 
 365           
 366          @param delimiter: column separator (defaults to tab) 
 367          @type delimiter: str  
 368          """ 
 369          return delimiter.join(map(Table._isnull, self._row)) 
  370       
 371      @property 
 373          """ 
 374          Available column names 
 375          @rtype: tuple 
 376          """ 
 377          c = dict((self._columns[cn], cn) for cn in self._columns) 
 378          return tuple(c[i] for i in sorted(c)) 
  379       
 380      @property 
 382          """ 
 383          Row number 
 384          @rtype: int 
 385          """ 
 386          return self._number 
   387                   
 389      """ 
 390      Build and query a TSV Table. See the documentation of L{csb.io.tsv} for 
 391      details and examples. 
 392       
 393      @param definition: column definition string: C{ColA:typeA colB:typeB ...}, 
 394                         where C{ColN} is a column name and C{typeN} is one of the 
 395                         base Python data types: str, int, long, float. 
 396                         Alternatively, the table definition may be specified 
 397                         directly as a list of metadata objects. 
 398      @type definition: str, tuple of L{ColumnInfo} 
 399      @param name: name of the table on the SQL backend. Useful when you need to 
 400                   execute native SQL queries against the table. 
 401      @type name: str 
 402      @param backend: table backend storage engine. This must be a proper 
 403                      L{RepositoryImp} bridge implementor. 
 404      @type backend: type (reference to a L{RepositoryImp} subclass) 
 405       
 406      @raise UnsupportedTypeError: when an unsupported type is used in the table 
 407                                   C{definition} 
 408      @raise ValueError: if the C{definition} is not valid 
 409      """ 
 410       
 411      """ 
 412      Table header string, used when saving and restoring TSV files. 
 413      """ 
 414      HEADER = '# @TSV ' 
 415      NULL = '' 
 416       
 418           
 419          if not issubclass(backend, RepositoryImp): 
 420              raise TypeError('The Table Backend must be a Repository Implementor') 
 421   
 422          self._name = name 
 423          self._backend = backend 
 424          self._imp = backend(name) 
 425          self._metadata = [] 
 426           
 427          try: 
 428              if isinstance(definition[0], ColumnInfo): 
 429                  self._metadata = [ c.copy() for c in definition ] 
 430              else: 
 431                  if isinstance(definition, csb.core.string): 
 432                      definition = [ (d.split(':')[0], getattr(builtins, d.split(':')[1])) for d in definition.split() ] 
 433                  self._metadata = [ ColumnInfo(c[0], c[1]) for c in definition ] 
 434              if len(self._metadata) < 1: 
 435                  raise ValueError() 
 436          except UnsupportedTypeError: 
 437              raise             
 438          except (TypeError, IndexError, ValueError, NameError, AttributeError): 
 439              raise ValueError('Invalid table definition') 
 440           
 441          self._imp.create(self._metadata) 
  442           
 443      @staticmethod 
 446          """ 
 447          Table factory: build a L{Table} from a TSV file. 
 448           
 449          @param tsv: TSV path and filename. This can be either a conventional 
 450                      TSV/CSV file, or a file created with C{table.dump(tsv)} 
 451          @type tsv: str 
 452          @param definition: table column definition (see L{Table}). If defined, 
 453                             this parameter will determine the structure of the 
 454                             table. Otherwise, the table definition will be 
 455                             extracted from the TSV header. If the file contains 
 456                             no TSV header, this parameter is mandatory. 
 457          @type definition: str, tuple of L{ColumnInfo}                            
 458          @param delimiter: column separator used in the file 
 459          @type delimiter: str 
 460          @param skip: skip the first N number of rows (the header can still be 
 461                       extracted from those however) 
 462          @type skip: int 
 463           
 464          @rtype: L{Table} 
 465           
 466          @raise ValueError: if neither a table C{definition} is provided, 
 467                             nor the C{tsv} file has a header line  
 468          """ 
 469   
 470          if not definition: 
 471              with open(tsv) as tsvfile: 
 472                  for line in tsvfile: 
 473                      if line.startswith(Table.HEADER): 
 474                          definition = line[ len(Table.HEADER) : ] 
 475                           
 476          if not definition: 
 477              raise ValueError('No header definition found') 
 478                       
 479          table = Table(definition, name=name, backend=backend)  
 480           
 481          with open(tsv) as tsvfile: 
 482              for i, line in enumerate(tsvfile, start=1): 
 483                  if (skip and i <= skip) or line.startswith(Table.HEADER): 
 484                      continue 
 485                  data = line.rstrip(os.linesep).split(delimiter) 
 486                  table.insert(data) 
 487           
 488          return table 
  489       
 490      @staticmethod 
 492          """ 
 493          Table factory: build a L{Table} from a 2D iterable/data reader. 
 494           
 495          @param iterable: data container 
 496          @type iterable: iterable (2D) 
 497          @param definition: table column definition (see L{Table}). 
 498          @type definition: str, tuple of L{ColumnInfo} 
 499   
 500          @rtype: L{Table} 
 501          """         
 502          table = Table(definition, name=name, backend=backend) 
 503            
 504          for row in iterable: 
 505              table.insert(list(row)) 
 506           
 507          return table 
  508       
 509      @staticmethod 
 511          """ 
 512          Table factory: build a L{Table} with the definition of another L{Table}. 
 513           
 514          @param table: template table 
 515          @type table: L{Table} 
 516          @param data: if True, also copy the data from the source C{table} 
 517          @type data: bool  
 518   
 519          @rtype: L{Table} 
 520          """ 
 521          if data: 
 522              return Table.from_iterable(table, table._metadata, name=name, backend=backend) 
 523          else: 
 524              return Table(table._metadata, name=name, backend=backend)             
  525   
 526      @staticmethod 
 532           
 550                   
 553           
 555          return self._imp.count() 
  556           
 564               
 566          import numpy 
 567          return numpy.array([ tuple(row) for row in self ]) 
  568               
 574           
 586               
 588   
 589          exp = self._interpret(i) 
 590           
 591          if len(exp.select) != 1: 
 592              raise NotImplementedError('single-column expression expected') 
 593          if csb.core.iterable(value): 
 594              raise NotImplementedError("single-value assignment expected")      
 595           
 596          exp.data = value 
 597          self._update(exp) 
  598               
 607       
 609          """ 
 610          Parse a table slice and convert it into an L{Expression}. 
 611          @rtype: L{Expression} 
 612          """ 
 613           
 614          if not csb.core.iterable(i): 
 615              i = [i, slice(None, None)]         
 616          else: 
 617              i = list(i) 
 618               
 619          if len(i) not in (1, 2): 
 620              raise ValueError('Tables are only 2 dimensional') 
 621          if len(i) == 1: 
 622              i.append(slice(None, None)) 
 623           
 624          exp = Expression(self.columns) 
 625          columns = self._getcols(i[1]) 
 626          if len(columns) < 1: 
 627              raise ValueError('Column slices must return at least one column') 
 628          exp.select = columns 
 629          exp.where = self.pk 
 630           
 631          if isinstance(i[0], int): 
 632              self._checkrow(i[0]) 
 633              if len(columns) == 1 and isinstance(i[1], (int, csb.core.string)): 
 634                  exp.scalar = True 
 635              exp.predicate = Equals(i[0] + 1) 
 636               
 637          elif csb.core.iterable(i[0]): 
 638              params = list(i[0]) 
 639              self._checkrow(params) 
 640              params = list(map(lambda x: x + 1, params)) 
 641              exp.predicate = In(params) 
 642               
 643          elif isinstance(i[0], slice): 
 644               
 645              sl = i[0] 
 646              if sl.step is not None: 
 647                  raise NotImplementedError('Row slice steps are not supported')               
 648               
 649              if sl == slice(None, None): 
 650                  exp.where = None 
 651              elif sl.start is None: 
 652                  self._checkrow(sl.stop)                 
 653                  exp.predicate = Lower(sl.stop + 1) 
 654              elif sl.stop is None: 
 655                  self._checkrow(sl.start)                 
 656                  exp.predicate = GreaterOrEquals(sl.start + 1) 
 657              else: 
 658                  self._checkrow([sl.start, sl.stop])        
 659                  exp.predicate = Between(sl.start + 1, sl.stop) 
 660                   
 661          else: 
 662              raise TypeError("Can't handle row slice expression: {0}".format(i[0])) 
 663           
 664          return exp 
  665           
 667           
 668          if isinstance(i, int): 
 669              if i < 0: 
 670                  raise NotImplementedError('Negative row indices are not supported') 
 671          elif csb.core.iterable(i): 
 672              for j in i: 
 673                  self._checkrow(j) 
 674          else: 
 675              raise TypeError(i) 
  676                
 678           
 679          columns = list(self.columns) 
 680           
 681          if spec is None and ifnull is not None: 
 682              return [ifnull] 
 683           
 684          elif isinstance(spec, int): 
 685              try: 
 686                  return [columns[spec]] 
 687              except: 
 688                  raise IndexError('Column {0} out of range'.format(spec)) 
 689               
 690          elif isinstance(spec, csb.core.string): 
 691              if spec in columns: 
 692                  return [spec] 
 693              else: 
 694                  raise InvalidColumnError(spec) 
 695           
 696          elif isinstance(spec, slice): 
 697              start = self._getcols(spec.start, columns[0]) 
 698              start = columns.index(start[0]) 
 699   
 700              end = self._getcols(spec.stop, columns[-1]) 
 701              end = columns.index(end[0]) 
 702              if spec.stop is None: 
 703                  end += 1 
 704                   
 705              return [columns[i] for i in range(start, end, spec.step or 1)] 
 706           
 707          elif csb.core.iterable(spec): 
 708              return [self._getcols(i)[0] for i in spec] 
 709           
 710          else: 
 711              raise TypeError("Can't handle column slice expression: {0}".format(spec)) 
  712   
 713      @property 
 715          """ 
 716          Table name 
 717          @rtype: str 
 718          """ 
 719          return self._name 
  720   
 721      @property 
 723          """ 
 724          All column names 
 725          @rtype: tuple 
 726          """ 
 727          return tuple(i.name for i in self._metadata) 
  728       
 729      @property 
 732       
 733 -    def dump(self, file): 
  734          """ 
 735          Dump the table in a file. 
 736           
 737          @param file: destination stream or filename 
 738          @type file: file (stream) or str (filename) 
 739          """ 
 740                   
 741          with csb.io.EntryWriter(file, close=False) as out: 
 742               
 743              definition = map(str, self._metadata) 
 744              out.write(Table.HEADER) 
 745              out.writeall(definition, delimiter=' ') 
 746              out.write(csb.io.NEWLINE) 
 747               
 748              for row in self: 
 749                  out.writeline(row.dump(delimiter='\t')) 
  750       
 751 -    def query(self, sql, params=None): 
  752          """ 
 753          Execute a native SQL query against the storage engine. 
 754           
 755          @param sql: SQL query text. May contain parameter binding placeholders 
 756                      (see L{Predicate.PH}). The SQL dialect of the query depends 
 757                      on the SQL C{backend} being used by the table. 
 758           
 759          @return: native data reader 
 760          @rtype: iterable (2D)         
 761          """ 
 762          return self._imp.query(sql, params) 
  763       
 765          """ 
 766          Insert a new row in the table. 
 767           
 768          @param row: a tuple of the appropriate length 
 769          @type row: tuple  
 770          """ 
 771          data = self._convert(row) 
 772          self._imp.insert(data) 
  773       
 775           
 776          metadata = dict((c.name, c) for c in self._metadata) 
 777          try: 
 778              return [metadata[cn].copy() for cn in columns] 
 779          except KeyError as ke: 
 780              raise InvalidColumnError(ke.message) 
 781          except: 
 782              raise 
  783           
 789       
 797           
 798 -    def where(self, column): 
  799          """ 
 800          @param column: column name         
 801          @type column: str 
 802          @raise InvalidColumnError: when an invalid column is requested 
 803          """ 
 804          exp = Expression(self.columns) 
 805          return Where(self, exp, column) 
  806       
 808          """ 
 809          @return: a new L{Table} 
 810           
 811          @param columns: column names; defaults to all columns 
 812          @type columns: str, tuple of str 
 813          @raise InvalidColumnError: when an invalid column is requested 
 814          """         
 815          columns = Expression.array(columns) 
 816           
 817          exp = Expression(self.columns) 
 818          exp.select = columns 
 819           
 820          return self._execute(exp) 
  821       
 822 -    def update(self, column, value): 
  823          """ 
 824          Update C{column} for all rows in the table. 
 825           
 826          @param column: column to update (name)         
 827          @type column: str 
 828          @param value: new column value 
 829          @raise InvalidColumnError: when an invalid column is referenced         
 830          """           
 831          exp = Expression(self.columns) 
 832          exp.select = [column] 
 833          exp.data = value 
 834           
 835          return self._update(exp) 
  836       
 837 -    def scalar(self, row=None, column=None): 
  838          """ 
 839          @return: a scalar value at the specified row and column. 
 840   
 841          @param row: row index; if not specified - take the first row 
 842          @type row: int 
 843          @param column: column name; if not specified - take the first 
 844          @type column: str 
 845           
 846          @raise IndexError: when an invalid row is requested 
 847          @raise InvalidColumnError: when an invalid column is requested         
 848          """ 
 849           
 850          if row is None: 
 851              row = 0 
 852          row += 1 
 853          if column is None: 
 854              column = self.columns[0] 
 855          elif column not in self.columns: 
 856              raise InvalidColumnError(column) 
 857               
 858          exp = Expression(self.columns) 
 859          exp.select = [column] 
 860          exp.where = self.pk 
 861          exp.predicate = Equals([row]) 
 862           
 863          reader = list(self._imp.execute(exp)) 
 864          if len(reader) > 0: 
 865              return reader[0][0] 
 866          else: 
 867              raise IndexError() 
  868           
 870          """ 
 871          @return: a list all values in the specified column  
 872           
 873          @param column: column to fetch         
 874          @type column: str 
 875          """ 
 876          if column not in self.columns: 
 877              raise InvalidColumnError(column) 
 878           
 879          return [ row[column] for row in self ] 
   880   
 882      """ 
 883      Metadata container: represents a table select or update expression. 
 884      """ 
 885       
 887           
 888          self._table = None 
 889          self._columns = [] 
 890           
 891          self._columns = list(columns) 
 892          self._select = [] 
 893          self._where = None 
 894          self._predicate = None 
 895          self._data = None 
 896          self._scalar = False 
 897           
 898          self.select = '*' 
  899       
 900      @staticmethod 
 905       
 906      @property 
 908          return tuple(self._columns) 
  909       
 910      @property 
 916           
 917      @property 
 920      @select.setter 
 935               
 936      @property 
 939      @where.setter 
 941          if not value: 
 942              self._where = None 
 943              self._predicate = None 
 944          else: 
 945              self._where = value 
  946       
 947      @property 
 949          return self._predicate 
  950      @predicate.setter 
 952          if not value: 
 953              self._where = None 
 954              self._predicate = None 
 955          else: 
 956              self._predicate = value 
  957   
 958      @property 
 961      @data.setter 
 962 -    def data(self, value): 
  964   
 965      @property 
 968      @scalar.setter 
  971         
 973       
 978           
 979      @property 
 982       
 983      @property 
 985          return self._expression       
   986       
 988       
 989 -    def __init__(self, table, expression, column): 
  996       
 997 -    def in_(self, *values): 
  999   
1000 -    def notin(self, *values): 
 1002       
1005       
1008   
1011           
1014   
1015 -    def lower(self, value): 
  1017           
1019   
1020 -    def __init__(self, table, expression, predicate): 
 1024           
1026          """ 
1027          @return: a new L{Table} 
1028           
1029          @param columns: column names; defaults to all columns 
1030          @type columns: str, tuple of str 
1031          @raise InvalidColumnError: when an invalid column is requested 
1032          """         
1033          exp = self.expression 
1034          exp.select = columns 
1035   
1036          return self.table._execute(exp) 
 1037       
1038 -    def update(self, column, value): 
 1039          """ 
1040          Update C{column} for all rows in the table. 
1041           
1042          @param column: column to update (name)         
1043          @type column: str 
1044          @param value: new column value 
1045          @raise InvalidColumnError: when an invalid column is referenced 
1046          """         
1047          exp = self.expression 
1048          exp.select = [column] 
1049          exp.data = value 
1050           
1051          return self.table._update(exp) 
  1052       
1054       
1055      __metaclass__ = ABCMeta 
1056       
1057      PH = '?' 
1058       
1073   
1074      @property 
1076          return tuple(self._params) 
 1077       
1079           
1080          if len(self._params) < 1: 
1081              raise ValueError('{0} predicate with no params'.format(self.__class__.__name__)) 
 1082       
1083      @abstractproperty 
1086       
 1089       
1090 -class In(Predicate): 
 1096   
1103       
1109       
1115       
1121   
1127   
1133   
1139               
1147