mongo-python-driver/gridfs/grid_file.py
2009-07-08 11:43:25 -04:00

309 lines
11 KiB
Python

# Copyright 2009 10gen, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""File-like object used for reading from and writing to GridFS"""
import types
import datetime
import math
from threading import Condition
from pymongo.son import SON
from pymongo.database import Database
from pymongo.objectid import ObjectId
from pymongo.dbref import DBRef
from pymongo.binary import Binary
from errors import CorruptGridFile
from pymongo import ASCENDING
# TODO we should use per-file reader-writer locks here instead,
# for performance. Unfortunately they aren't in the Python standard library.
_files_lock = Condition()
_open_files = {}
class GridFile(object):
    """A "file" stored in GridFS.

    File metadata lives in the ``<collection>.files`` collection and the
    file contents are stored as numbered chunks in ``<collection>.chunks``.
    """

    # TODO should be able to create a GridFile given a Collection object instead
    # of a database and collection name?
    # TODO this whole file_spec thing is over-engineered. ought to be just
    # filename.
    def __init__(self, file_spec, database, mode="r", collection="fs"):
        """Open a "file" in GridFS.

        Application developers should generally not need to instantiate this
        class directly - instead see the `gridfs.open` method.

        Only a single opened GridFile instance may exist for a file in gridfs
        at any time. Care must be taken to close GridFile instances when done
        using them. Opening a file that is already open blocks until the
        other instance is closed.

        Raises TypeError if file_spec is not an instance of dict, database is
        not an instance of `pymongo.database.Database`, or collection or mode
        is not an instance of (str, unicode). Raises ValueError if mode is
        not one of ("r", "w"). Raises IOError if mode is "r" and no file
        matches file_spec.

        The file_spec argument must be a SON query specifier for the file to
        open. The *first* file matching the specifier will be opened. If no
        such files exist, a new file is created using the metadata in
        file_spec (file_spec itself is updated with the generated fields).
        Opening an existing file in mode "w" erases its current contents.

        The valid fields in a file_spec are as follows:

          - "_id": unique ID for this file
            * default: `pymongo.objectid.ObjectId()`
          - "filename": human name for the file
          - "contentType": valid mime-type for the file
          - "length": size of the file, in bytes
            * only used for querying, automatically set for inserts
          - "chunkSize": size of each of the chunks, in bytes
            * default: 256000 bytes
          - "uploadDate": date when the object was first stored
            * only used for querying, automatically set for inserts
          - "aliases": array of alias strings
          - "metadata": a SON document containing arbitrary data

        :Parameters:
          - `file_spec`: query specifier as described above
          - `database`: the database to store/retrieve this file in
          - `mode` (optional): the mode to open this file with, one of
            ("r", "w")
          - `collection` (optional): the collection in which to store/retrieve
            this file
        """
        if not isinstance(file_spec, types.DictType):
            raise TypeError("file_spec must be an instance of (dict, SON)")
        if not isinstance(database, Database):
            raise TypeError("database must be an instance of database")
        if not isinstance(collection, types.StringTypes):
            raise TypeError("collection must be an instance of (str, unicode)")
        if not isinstance(mode, types.StringTypes):
            raise TypeError("mode must be an instance of (str, unicode)")
        if mode not in ("r", "w"):
            raise ValueError("mode must be one of ('r', 'w')")

        self.__collection = database[collection]
        self.__collection.chunks.ensure_index([("files_id", ASCENDING),
                                               ("n", ASCENDING)])

        _files_lock.acquire()
        # try/finally so the module-wide lock is released even when a
        # database call below raises; otherwise every later open()/close()
        # in the process would block forever.
        try:
            grid_file = self.__collection.files.find_one(file_spec)
            if grid_file:
                self.__id = grid_file["_id"]
            else:
                if mode == "r":
                    raise IOError("No such file: %r" % (file_spec,))
                file_spec["length"] = 0
                file_spec["uploadDate"] = datetime.datetime.utcnow()
                file_spec.setdefault("chunkSize", 256000)
                self.__id = self.__collection.files.insert(file_spec)["_id"]

            # we use repr(self.__id) here because we need it to be string and
            # filename gets tricky with renaming. this is a hack.
            while repr(self.__id) in _open_files:
                _files_lock.wait()
            _open_files[repr(self.__id)] = True
        finally:
            _files_lock.release()

        self.__mode = mode
        if mode == "w":
            self.__erase()
        self.__buffer = ""
        self.__position = 0
        self.__chunk_number = 0
        self.__closed = False

    def __erase(self):
        """Erase all of the data stored in this GridFile.

        Resets the stored length to 0 and removes every chunk of the file.
        """
        grid_file = self.__collection.files.find_one({"_id": self.__id})
        grid_file["next"] = None
        grid_file["length"] = 0
        self.__collection.files.save(grid_file)
        self.__collection.chunks.remove({"files_id": self.__id})

    def closed(self):
        """True once `close()` has completed on this GridFile."""
        return self.__closed
    closed = property(closed)

    def mode(self):
        """The mode this GridFile was opened with ("r" or "w")."""
        return self.__mode
    mode = property(mode)

    def __create_property(field_name, read_only=False):
        """Build a property backed by a field of this file's files document.

        Every access goes to the database: the getter fetches the document
        and returns `field_name` (None if the field is absent); the setter
        fetches the document, updates the field and saves it back.

        :Parameters:
          - `field_name`: name of the field in the files document
          - `read_only` (optional): if True no setter is created
        """
        def getter(self):
            grid_file = self.__collection.files.find_one({"_id": self.__id})
            return grid_file.get(field_name, None)

        def setter(self, value):
            grid_file = self.__collection.files.find_one({"_id": self.__id})
            grid_file[field_name] = value
            self.__collection.files.save(grid_file)

        if read_only:
            return property(getter)
        return property(getter, setter)

    name = __create_property("filename", True)
    content_type = __create_property("contentType")
    length = __create_property("length", True)
    chunk_size = __create_property("chunkSize", True)
    upload_date = __create_property("uploadDate", True)
    aliases = __create_property("aliases")
    metadata = __create_property("metadata")
    md5 = __create_property("md5", True)

    def rename(self, filename):
        """Rename this GridFile.

        The new name is saved to the files collection immediately.

        :Parameters:
          - `filename`: the new name for this GridFile
        """
        grid_file = self.__collection.files.find_one({"_id": self.__id})
        grid_file["filename"] = filename
        self.__collection.files.save(grid_file)

    def __max_chunk(self):
        """Return the chunk currently being written to, or None if it does
        not exist yet.
        """
        return self.__collection.chunks.find_one(
            {"files_id": self.__id, "n": self.__chunk_number})

    def __new_chunk(self, n):
        """Insert an empty chunk numbered `n` for this file and return it."""
        chunk = {"files_id": self.__id,
                 "n": n,
                 "data": ""}
        self.__collection.chunks.insert(chunk)
        return chunk

    def __write_buffer_to_chunks(self):
        """Write the buffer contents out to chunks.

        Fills the current chunk up to the file's chunk size, then moves on
        to new chunks until the buffer is empty.
        """
        if not self.__buffer:
            return

        # chunk_size is a database-backed property; look it up once instead
        # of on every loop iteration.
        chunk_size = self.chunk_size
        while self.__buffer:
            chunk = self.__max_chunk()
            if not chunk:
                chunk = self.__new_chunk(self.__chunk_number)

            space = (self.__chunk_number + 1) * chunk_size - self.__position
            if not space:
                # current chunk is full - start the next one
                self.__chunk_number += 1
                chunk = self.__new_chunk(self.__chunk_number)
                space = chunk_size

            to_write = min(len(self.__buffer), space)
            chunk["data"] = Binary(chunk["data"] + self.__buffer[:to_write])
            self.__collection.chunks.save(chunk)
            self.__buffer = self.__buffer[to_write:]
            self.__position += to_write

    def flush(self):
        """Flush the GridFile to the database.

        Does nothing for files opened in mode "r". For files opened in mode
        "w" the buffered data is written out as chunks and the "md5" and
        "length" fields of the file are updated. Raises ValueError if this
        GridFile is already closed.
        """
        self.__assert_open()
        if self.mode != "w":
            return

        self.__write_buffer_to_chunks()

        # the md5 is computed by the server, over the chunks just written
        md5 = self.__collection.database()._command(
            SON([("filemd5", self.__id),
                 ("root", self.__collection.name())]))["md5"]

        grid_file = self.__collection.files.find_one({"_id": self.__id})
        grid_file["md5"] = md5
        grid_file["length"] = self.__position + len(self.__buffer)
        self.__collection.files.save(grid_file)

    def close(self):
        """Close the GridFile.

        A closed GridFile cannot be read or written any more. Calling `close()`
        more than once is allowed.

        If the final `flush()` raises, the file is left open (and still
        registered as open) so that `close()` can be retried.
        """
        if self.__closed:
            return

        self.flush()
        self.__closed = True

        _files_lock.acquire()
        try:
            _open_files.pop(repr(self.__id), None)
            # wake up anyone blocked in __init__ waiting for this file
            _files_lock.notifyAll()
        finally:
            _files_lock.release()

    def __assert_open(self, mode=None):
        """Raise ValueError unless this GridFile is open.

        :Parameters:
          - `mode` (optional): if given, also require that the file was
            opened in this mode
        """
        if mode and self.mode != mode:
            raise ValueError("file must be open in mode %r" % mode)
        if self.closed:
            raise ValueError("operation cannot be performed on a closed GridFile")

    def read(self, size=-1):
        """Read at most size bytes from the file (less if there isn't enough
        data).

        The bytes are returned as a string object. If size is negative or
        omitted all data is read. Raises ValueError if this GridFile is
        already closed or was not opened in mode "r". Raises CorruptGridFile
        if a chunk that should exist cannot be found.

        :Parameters:
          - `size` (optional): the number of bytes to read
        """
        self.__assert_open("r")
        if size == 0:
            return ""

        remainder = int(self.length) - self.__position
        if size < 0 or size > remainder:
            size = remainder

        data = self.__buffer
        # The buffer holds the unread tail of the last chunk fetched, so the
        # next chunk needed is the one *after* the buffered data. Deriving it
        # from the position alone would fetch the buffered chunk a second
        # time and return duplicated bytes.
        chunk_number = (self.__position + len(data)) // self.chunk_size
        while len(data) < size:
            chunk = self.__collection.chunks.find_one(
                {"files_id": self.__id, "n": chunk_number})
            if not chunk:
                raise CorruptGridFile("no chunk for n = %r" % (chunk_number,))
            data += chunk["data"]
            chunk_number += 1

        self.__position += size
        self.__buffer = data[size:]
        return data[:size]

    # TODO should support writing unicode to a file. this means that files will
    # need to have an encoding attribute.
    def write(self, str):
        """Write a string to the GridFile. There is no return value.

        Due to buffering, the string may not actually show up in the database
        until the `flush()` or `close()` method is called. Raises ValueError if
        this GridFile is already closed or was not opened in mode "w". Raises
        TypeError if str is not an instance of str.

        :Parameters:
          - `str`: string of bytes to be written to the file
        """
        self.__assert_open("w")
        if not isinstance(str, types.StringType):
            raise TypeError("can only write strings")
        if not len(str):
            return
        self.__buffer += str

    def writelines(self, sequence):
        """Write a sequence of strings to the file.

        Does not add separators.

        :Parameters:
          - `sequence`: iterable of strings, each written with `write()`
        """
        for line in sequence:
            self.write(line)