[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#704594: ITP: python-arpy -- library for accessing the archive files and reading the contents



On Wed, Apr 03, 2013 at 04:27:43PM +0200, Helmut Grohne wrote:
> So maybe we can work on a solution here? I'll try to patch arpy to
> support this use case. Give me a week?

Turned out to be easier than expected. Maybe less arguing and more
fixing would have helped here. The attached patch makes reading from
stdin possible. It has one major drawback at the moment. Even when your backing
fileobj does support seeking, it still emulates forward seeks using
reads, because there is no way to reliably detect the availability of
seek. So accessing random members of an Archive could result in very bad
performance (especially if combined with bz2). In order to fix that, the
user has to provide some indication of whether seek is desired. When using
Archive without seek, only the current ArchiveFileData may be read, and
only once from start to end. To ease working with the Archive class, I turned it
into an iterator yielding ArchiveFileDatas. When using this iterator
forward seeks are almost never necessary. You can omit it, but I still
think it is useful.

Helmut
diff -r 67ef59afde76 arpy.py
--- a/arpy.py	Sun Mar 24 01:52:07 2013 +0000
+++ b/arpy.py	Wed Apr 03 17:40:11 2013 +0200
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # Copyright 2011 Stanisław Pitucha. All rights reserved.
+# Copyright 2013 Helmut Grohne. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without modification, are
 # permitted provided that the following conditions are met:
@@ -127,12 +128,12 @@
 class ArchiveFileData(object):
 	""" File-like object used for reading an archived file """
 
-	def __init__(self, file_obj, header):
+	def __init__(self, ar_obj, header):
 		"""
 		Creates a new proxy for the archived file, reusing the archive's file descriptor
 		"""
 		self.header = header
-		self.file = file_obj
+		self.arobj = ar_obj
 		self.last_offset = 0
 
 	def read(self, size = None):
@@ -143,8 +144,8 @@
 		if self.header.size < self.last_offset + size:
 			size = self.header.size - self.last_offset
 
-		self.file.seek(self.header.file_offset + self.last_offset)
-		data = self.file.read(size)
+		self.arobj._seek(self.header.file_offset + self.last_offset)
+		data = self.arobj._read(size)
 		if len(data) < size:
 			raise ArchiveAccessError("incorrect archive file")
 
@@ -175,18 +176,38 @@
 	def __init__(self, filename=None, fileobj=None):
 		self.headers = []
 		self.file = fileobj or open(filename, "rb")
-		if self.file.read(GLOBAL_HEADER_LEN) != b"!<arch>\n":
+		self.position = 0
+		if self._read(GLOBAL_HEADER_LEN) != b"!<arch>\n":
 			raise ArchiveFormatError("file is missing the global header")
 		
 		self.next_header_offset = GLOBAL_HEADER_LEN
 		self.gnu_table = None
 		self.archived_files = {}
 
+	def _read(self, length):
+		data = self.file.read(length)
+		self.position += len(data)
+		return data
+
+	def _seek(self, offset):
+		if offset < 0:
+			raise ArchiveAccessError("incorrect file position")
+		if offset < self.position:
+			# seek required, might fail
+			self.file.seek(offset)
+			self.position = self.file.tell()
+		else:
+			# emulate seek
+			while self.position < offset:
+				if not self._read(min(4096, offset - self.position)):
+					# reached EOF before target offset
+					return
+
 	def __read_file_header(self, offset):
 		""" Reads and returns a single new file header """
-		self.file.seek(offset)
+		self._seek(offset)
 
-		header = self.file.read(HEADER_LEN)
+		header = self._read(HEADER_LEN)
 
 		if len(header) == 0:
 			return None
@@ -208,7 +229,7 @@
 
 	def __read_gnu_table(self, size):
 		""" Reads the table of filenames specific to GNU ar format """
-		table_string = self.file.read(size)
+		table_string = self._read(size)
 		if len(table_string) != size:
 			raise ArchiveFormatError("file too short to fit the names table")
 
@@ -234,8 +255,8 @@
 			# BSD format includes the filename in the file size
 			header.size -= filename_len
 
-			self.file.seek(header.offset + HEADER_LEN)
-			header.name = self.file.read(filename_len)
+			self._seek(header.offset + HEADER_LEN)
+			header.name = self._read(filename_len)
 			return filename_len
 
 		elif header.type == HEADER_GNU_TABLE:
@@ -274,10 +295,22 @@
 		if header is not None:
 			self.headers.append(header)
 			if header.type in (HEADER_BSD, HEADER_NORMAL, HEADER_GNU):
-				self.archived_files[header.name] = ArchiveFileData(self.file, header)
+				self.archived_files[header.name] = ArchiveFileData(self, header)
 
 		return header
 
+	def __next__(self):
+		while True:
+			header = self.read_next_header()
+			if header is None:
+				raise StopIteration
+			if header.type in (HEADER_BSD, HEADER_NORMAL, HEADER_GNU):
+				return self.archived_files[header.name]
+	next = __next__
+
+	def __iter__(self):
+		return self
+
 	def read_all_headers(self):
 		""" Reads all headers """
 		while self.read_next_header() is not None:

Reply to: