Bug#988771: unblock: mat2/0.12.1-1
Package: release.debian.org
Severity: normal
User: release.debian.org@packages.debian.org
Usertags: unblock
Dear release team,
Please unblock mat2 0.12.1-1.
It ships improved support for EPUB and Microsoft Office files. It's a key
package, as doxygen build-depends on it, but no regressions have been reported
so far, and the autopkgtest results look good as well. The diff is quite small:
~ debdiff mat2_0.12.0-1.dsc mat2_0.12.1-1.dsc | diffstat
CHANGELOG.md | 5 +++++
debian/changelog | 7 +++++++
doc/mat2.1 | 2 +-
libmat2/epub.py | 49 +++++++++++++++++++++++++++++++++++++++++++------
libmat2/office.py | 2 ++
mat2 | 2 +-
setup.py | 2 +-
7 files changed, 60 insertions(+), 9 deletions(-)
Please find the full debdiff attached.
unblock mat2/0.12.1-1
Thanks for your work,
cheers,
Georg
diff -Nru mat2-0.12.0/CHANGELOG.md mat2-0.12.1/CHANGELOG.md
--- mat2-0.12.0/CHANGELOG.md 2020-12-18 16:55:41.000000000 +0000
+++ mat2-0.12.1/CHANGELOG.md 2021-03-19 16:54:21.000000000 +0000
@@ -1,3 +1,8 @@
+# 0.12.1 - 2021-03-19
+
+- Improve epub support
+- Improve MS Office support
+
# 0.12.0 - 2020-12-18
- Improve significantly MS Office formats support
diff -Nru mat2-0.12.0/debian/changelog mat2-0.12.1/debian/changelog
--- mat2-0.12.0/debian/changelog 2020-12-26 19:52:55.000000000 +0000
+++ mat2-0.12.1/debian/changelog 2021-03-20 19:11:38.000000000 +0000
@@ -1,3 +1,10 @@
+mat2 (0.12.1-1) unstable; urgency=medium
+
+ * New upstream version 0.12.1:
+ - Ships improved support of EPUB and Microsoft Office files.
+
+ -- Georg Faerber <georg@debian.org> Sat, 20 Mar 2021 19:11:38 +0000
+
mat2 (0.12.0-1) unstable; urgency=medium
* Team upload.
diff -Nru mat2-0.12.0/doc/mat2.1 mat2-0.12.1/doc/mat2.1
--- mat2-0.12.0/doc/mat2.1 2020-12-18 16:55:41.000000000 +0000
+++ mat2-0.12.1/doc/mat2.1 2021-03-19 16:54:21.000000000 +0000
@@ -1,4 +1,4 @@
-.TH mat2 "1" "December 2020" "mat2 0.12.0" "User Commands"
+.TH mat2 "1" "March 2021" "mat2 0.12.1" "User Commands"
.SH NAME
mat2 \- the metadata anonymisation toolkit 2
diff -Nru mat2-0.12.0/libmat2/epub.py mat2-0.12.1/libmat2/epub.py
--- mat2-0.12.0/libmat2/epub.py 2020-12-18 16:55:41.000000000 +0000
+++ mat2-0.12.1/libmat2/epub.py 2021-03-19 16:54:21.000000000 +0000
@@ -1,7 +1,9 @@
import logging
import re
import uuid
+import zipfile
import xml.etree.ElementTree as ET # type: ignore
+from typing import Dict, Any
from . import archive, office
@@ -15,11 +17,28 @@
'META-INF/container.xml',
'mimetype',
'OEBPS/content.opf',
+ 'content.opf',
+ 'hmh.opf',
+ 'OPS/.+.xml'
}))
+ self.files_to_omit = set(map(re.compile, { # type: ignore
+ 'iTunesMetadata.plist',
+ 'META-INF/calibre_bookmarks.txt',
+ 'OEBPS/package.opf',
+ }))
self.uniqid = uuid.uuid4()
- def _specific_get_meta(self, full_path, file_path):
- if file_path != 'OEBPS/content.opf':
+
+ def is_archive_valid(self):
+ super().is_archive_valid()
+ with zipfile.ZipFile(self.filename) as zin:
+ for item in self._get_all_members(zin):
+ member_name = self._get_member_name(item)
+ if member_name.endswith('META-INF/encryption.xml'):
+ raise ValueError('the file contains encrypted fonts')
+
+ def _specific_get_meta(self, full_path, file_path) -> Dict[str, Any]:
+ if not file_path.endswith('.opf'):
return {}
with open(full_path, encoding='utf-8') as f:
@@ -30,14 +49,32 @@
except (TypeError, UnicodeDecodeError):
return {file_path: 'harmful content', }
- def _specific_cleanup(self, full_path: str):
- if full_path.endswith('OEBPS/content.opf'):
+ def _specific_cleanup(self, full_path: str) -> bool:
+ if full_path.endswith('hmh.opf') or full_path.endswith('content.opf'):
return self.__handle_contentopf(full_path)
elif full_path.endswith('OEBPS/toc.ncx'):
return self.__handle_tocncx(full_path)
+ elif re.search('/OPS/[^/]+.xml$', full_path):
+ return self.__handle_ops_xml(full_path)
return True
- def __handle_tocncx(self, full_path: str):
+ def __handle_ops_xml(self, full_path: str) -> bool:
+ try:
+ tree, namespace = office._parse_xml(full_path)
+ except ET.ParseError: # pragma: nocover
+ logging.error("Unable to parse %s in %s.", full_path, self.filename)
+ return False
+
+ for item in tree.iterfind('.//', namespace): # pragma: nocover
+ if item.tag.strip().lower().endswith('head'):
+ item.clear()
+ break
+ tree.write(full_path, xml_declaration=True, encoding='utf-8',
+ short_empty_elements=False)
+ return True
+
+
+ def __handle_tocncx(self, full_path: str) -> bool:
try:
tree, namespace = office._parse_xml(full_path)
except ET.ParseError: # pragma: nocover
@@ -53,7 +90,7 @@
short_empty_elements=False)
return True
- def __handle_contentopf(self, full_path: str):
+ def __handle_contentopf(self, full_path: str) -> bool:
try:
tree, namespace = office._parse_xml(full_path)
except ET.ParseError:
diff -Nru mat2-0.12.0/libmat2/office.py mat2-0.12.1/libmat2/office.py
--- mat2-0.12.0/libmat2/office.py 2020-12-18 16:55:41.000000000 +0000
+++ mat2-0.12.1/libmat2/office.py 2021-03-19 16:54:21.000000000 +0000
@@ -87,6 +87,7 @@
self.files_to_keep = set(map(re.compile, { # type: ignore
r'^\[Content_Types\]\.xml$',
r'^_rels/\.rels$',
+ r'^xl/sharedStrings\.xml$', # https://docs.microsoft.com/en-us/office/open-xml/working-with-the-shared-string-table
r'^(?:word|ppt|xl)/_rels/document\.xml\.rels$',
r'^(?:word|ppt|xl)/_rels/footer[0-9]*\.xml\.rels$',
r'^(?:word|ppt|xl)/_rels/header[0-9]*\.xml\.rels$',
@@ -108,6 +109,7 @@
r'^ppt/slideMasters/_rels/slideMaster[0-9]+\.xml\.rels',
}))
self.files_to_omit = set(map(re.compile, { # type: ignore
+ r'^\[trash\]/',
r'^customXml/',
r'webSettings\.xml$',
r'^docProps/custom\.xml$',
diff -Nru mat2-0.12.0/mat2 mat2-0.12.1/mat2
--- mat2-0.12.0/mat2 2020-12-18 16:55:41.000000000 +0000
+++ mat2-0.12.1/mat2 2021-03-19 16:54:21.000000000 +0000
@@ -17,7 +17,7 @@
print(e)
sys.exit(1)
-__version__ = '0.12.0'
+__version__ = '0.12.1'
# Make pyflakes happy
assert Set
diff -Nru mat2-0.12.0/setup.py mat2-0.12.1/setup.py
--- mat2-0.12.0/setup.py 2020-12-18 16:55:41.000000000 +0000
+++ mat2-0.12.1/setup.py 2021-03-19 16:54:21.000000000 +0000
@@ -5,7 +5,7 @@
setuptools.setup(
name="mat2",
- version='0.12.0',
+ version='0.12.1',
author="Julien (jvoisin) Voisin",
author_email="julien.voisin+mat2@dustri.org",
description="A handy tool to trash your metadata",
Reply to: