Bug#656288: python3-apt: difficulties with non-UTF-8-encoded TagFiles
On Wed, Jan 18, 2012 at 12:14:02PM +0100, Julian Andres Klode wrote:
> You'd also need to take care of TagSection if that is done, which should
> then work in bytes mode when passed a bytes string.
>
> Basically you'd need to modify TagSection and TagFile to both store whether
> to use bytes or unicode and pass the value of that flag from the TagFile
> to the TagSection. Then create a function
>
> PyObject *TagFile_ToString(char *s, size_t n)
>
> or similar that uses PyString_* functions or PyBytes_ functions depending
> on the context (where PyString is mapped to unicode in Python 3, and
> str in Python 2). Then use that function everywhere we currently
> create strings in the TagFile.
OK. How about something like this? I added both an explicit bytes=
parameter and a fallback which tries to detect the encoding from the
file object.
=== modified file 'python/tag.cc'
--- python/tag.cc 2011-11-10 16:20:58 +0000
+++ python/tag.cc 2012-01-20 14:47:56 +0000
@@ -38,6 +38,10 @@ using namespace std;
struct TagSecData : public CppPyObject<pkgTagSection>
{
char *Data;
+ bool Bytes;
+#if PY_MAJOR_VERSION >= 3
+ PyObject *Encoding;
+#endif
};
// The owner of the TagFile is a Python file object.
@@ -45,6 +49,10 @@ struct TagFileData : public CppPyObject<
{
TagSecData *Section;
FileFd Fd;
+ bool Bytes;
+#if PY_MAJOR_VERSION >= 3
+ PyObject *Encoding;
+#endif
};
// Traversal and Clean for owned objects
@@ -60,6 +68,35 @@ int TagFileClear(PyObject *self) {
return 0;
}
+// Helpers to return Unicode or bytes as appropriate.
+#if PY_MAJOR_VERSION < 3
+#define TagSecString_FromStringAndSize(self, v, len) \
+ PyString_FromStringAndSize((v), (len))
+#define TagSecString_FromString(self, v) PyString_FromString(v)
+#else
+PyObject *TagSecString_FromStringAndSize(PyObject *self, const char *v,
+ Py_ssize_t len) {
+ TagSecData *Self = (TagSecData *)self;
+ if (Self->Bytes)
+ return PyBytes_FromStringAndSize(v, len);
+ else if (Self->Encoding)
+ return PyUnicode_Decode(v, len, PyUnicode_AsString(Self->Encoding), 0);
+ else
+ return PyUnicode_FromStringAndSize(v, len);
+}
+
+PyObject *TagSecString_FromString(PyObject *self, const char *v) {
+ TagSecData *Self = (TagSecData *)self;
+ if (Self->Bytes)
+ return PyBytes_FromString(v);
+ else if (Self->Encoding)
+ return PyUnicode_Decode(v, strlen(v),
+ PyUnicode_AsString(Self->Encoding), 0);
+ else
+ return PyUnicode_FromString(v);
+}
+#endif
+
/*}}}*/
// TagSecFree - Free a Tag Section /*{{{*/
@@ -107,9 +144,9 @@ static PyObject *TagSecFind(PyObject *Se
{
if (Default == 0)
Py_RETURN_NONE;
- return PyString_FromString(Default);
+ return TagSecString_FromString(Self,Default);
}
- return PyString_FromStringAndSize(Start,Stop-Start);
+ return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
}
static char *doc_FindRaw =
@@ -128,14 +165,14 @@ static PyObject *TagSecFindRaw(PyObject
{
if (Default == 0)
Py_RETURN_NONE;
- return PyString_FromString(Default);
+ return TagSecString_FromString(Self,Default);
}
const char *Start;
const char *Stop;
GetCpp<pkgTagSection>(Self).Get(Start,Stop,Pos);
- return PyString_FromStringAndSize(Start,Stop-Start);
+ return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
}
static char *doc_FindFlag =
@@ -161,21 +198,18 @@ static PyObject *TagSecFindFlag(PyObject
// Map access, operator []
static PyObject *TagSecMap(PyObject *Self,PyObject *Arg)
{
- if (PyString_Check(Arg) == 0)
- {
- PyErr_SetNone(PyExc_TypeError);
+ const char *Name = PyObject_AsString(Arg);
+ if (Name == 0)
return 0;
- }
-
const char *Start;
const char *Stop;
- if (GetCpp<pkgTagSection>(Self).Find(PyString_AsString(Arg),Start,Stop) == false)
+ if (GetCpp<pkgTagSection>(Self).Find(Name,Start,Stop) == false)
{
- PyErr_SetString(PyExc_KeyError,PyString_AsString(Arg));
+ PyErr_SetString(PyExc_KeyError,Name);
return 0;
}
- return PyString_FromStringAndSize(Start,Stop-Start);
+ return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
}
// len() operation
@@ -230,9 +264,9 @@ static PyObject *TagSecExists(PyObject *
static int TagSecContains(PyObject *Self,PyObject *Arg)
{
- if (PyString_Check(Arg) == 0)
- return 0;
- const char *Name = PyString_AsString(Arg);
+ const char *Name = PyObject_AsString(Arg);
+ if (Name == 0)
+ return 0;
const char *Start;
const char *Stop;
if (GetCpp<pkgTagSection>(Self).Find(Name,Start,Stop) == false)
@@ -256,7 +290,7 @@ static PyObject *TagSecStr(PyObject *Sel
const char *Start;
const char *Stop;
GetCpp<pkgTagSection>(Self).GetSection(Start,Stop);
- return PyString_FromStringAndSize(Start,Stop-Start);
+ return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
}
/*}}}*/
// TagFile Wrappers /*{{{*/
@@ -286,6 +320,12 @@ static PyObject *TagFileNext(PyObject *S
Obj.Section->Owner = Self;
Py_INCREF(Obj.Section->Owner);
Obj.Section->Data = 0;
+ Obj.Section->Bytes = Obj.Bytes;
+#if PY_MAJOR_VERSION >= 3
+ // We don't need to incref Encoding as the previous Section object already
+ // held a reference to it.
+ Obj.Section->Encoding = Obj.Encoding;
+#endif
if (Obj.Object.Step(Obj.Section->Object) == false)
return HandleErrors(NULL);
@@ -347,11 +387,12 @@ static PyObject *TagFileJump(PyObject *S
static PyObject *TagSecNew(PyTypeObject *type,PyObject *Args,PyObject *kwds) {
char *Data;
int Len;
- char *kwlist[] = {"text", 0};
+ char Bytes = 0;
+ char *kwlist[] = {"text", "bytes", 0};
// this allows reading "byte" types from python3 - but we don't
// make (much) use of it yet
- if (PyArg_ParseTupleAndKeywords(Args,kwds,"s#",kwlist,&Data,&Len) == 0)
+ if (PyArg_ParseTupleAndKeywords(Args,kwds,"s#|b",kwlist,&Data,&Len,&Bytes) == 0)
return 0;
// Create the object..
@@ -359,6 +400,10 @@ static PyObject *TagSecNew(PyTypeObject
new (&New->Object) pkgTagSection();
New->Data = new char[strlen(Data)+2];
snprintf(New->Data,strlen(Data)+2,"%s\n",Data);
+ New->Bytes = Bytes;
+#if PY_MAJOR_VERSION >= 3
+ New->Encoding = 0;
+#endif
if (New->Object.Scan(New->Data,strlen(New->Data)) == false)
{
@@ -390,9 +435,10 @@ PyObject *ParseSection(PyObject *self,Py
static PyObject *TagFileNew(PyTypeObject *type,PyObject *Args,PyObject *kwds)
{
- PyObject *File;
- char *kwlist[] = {"file", 0};
- if (PyArg_ParseTupleAndKeywords(Args,kwds,"O",kwlist,&File) == 0)
+ PyObject *File = 0;
+ char Bytes = 0;
+ char *kwlist[] = {"file", "bytes", 0};
+ if (PyArg_ParseTupleAndKeywords(Args,kwds,"O|b",kwlist,&File,&Bytes) == 0)
return 0;
int fileno = PyObject_AsFileDescriptor(File);
if (fileno == -1)
@@ -405,8 +451,15 @@ static PyObject *TagFileNew(PyTypeObject
#else
new (&New->Fd) FileFd(fileno,false);
#endif
+ New->Bytes = Bytes;
New->Owner = File;
Py_INCREF(New->Owner);
+#if PY_MAJOR_VERSION >= 3
+ New->Encoding = PyObject_GetAttr(File, PyUnicode_FromString("encoding"));
+ if (!PyUnicode_Check(New->Encoding))
+ New->Encoding = 0;
+ Py_XINCREF(New->Encoding);
+#endif
new (&New->Object) pkgTagFile(&New->Fd);
// Create the section
@@ -415,6 +468,11 @@ static PyObject *TagFileNew(PyTypeObject
New->Section->Owner = New;
Py_INCREF(New->Section->Owner);
New->Section->Data = 0;
+ New->Section->Bytes = Bytes;
+#if PY_MAJOR_VERSION >= 3
+ New->Section->Encoding = New->Encoding;
+ Py_XINCREF(New->Section->Encoding);
+#endif
return HandleErrors(New);
}
@@ -492,7 +550,7 @@ PyObject *RewriteSection(PyObject *self,
}
// Return the string
- PyObject *ResObj = PyString_FromStringAndSize(bp,size);
+ PyObject *ResObj = TagSecString_FromStringAndSize(Section,bp,size);
free(bp);
return HandleErrors(ResObj);
}
@@ -521,11 +579,15 @@ PySequenceMethods TagSecSeqMeth = {0,0,0
PyMappingMethods TagSecMapMeth = {TagSecLength,TagSecMap,0};
-static char *doc_TagSec = "TagSection(text: str)\n\n"
+static char *doc_TagSec = "TagSection(text: str, [bytes: bool = False])\n\n"
"Provide methods to access RFC822-style header sections, like those\n"
"found in debian/control or Packages files.\n\n"
"TagSection() behave like read-only dictionaries and also provide access\n"
- "to the functions provided by the C++ class (e.g. find)";
+ "to the functions provided by the C++ class (e.g. find).\n\n"
+ "By default, text read from files is treated as strings (binary data in\n"
+ "Python 2, Unicode strings in Python 3). Use bytes=True to cause all\n"
+ "header values read from this TagSection to be bytes even in Python 3.\n"
+ "Header names are always treated as Unicode.";
PyTypeObject PyTagSection_Type =
{
PyVarObject_HEAD_INIT(&PyType_Type, 0)
@@ -596,7 +658,7 @@ static PyGetSetDef TagFileGetSet[] = {
};
-static char *doc_TagFile = "TagFile(file)\n\n"
+static char *doc_TagFile = "TagFile(file, [bytes: bool = False])\n\n"
"TagFile() objects provide access to debian control files, which consist\n"
"of multiple RFC822-style sections.\n\n"
"To provide access to those sections, TagFile objects provide an iterator\n"
@@ -608,7 +670,11 @@ static char *doc_TagFile = "TagFile(file
"It is important to not mix the use of both APIs, because this can have\n"
"unwanted effects.\n\n"
"The parameter 'file' refers to an object providing a fileno() method or\n"
- "a file descriptor (an integer)";
+ "a file descriptor (an integer).\n\n"
+ "By default, text read from files is treated as strings (binary data in\n"
+ "Python 2, Unicode strings in Python 3). Use bytes=True to cause all\n"
+ "header values read from this TagFile to be bytes even in Python 3.\n"
+ "Header names are always treated as Unicode.";
// Type for a Tag File
PyTypeObject PyTagFile_Type =
=== added file 'tests/test_tagfile.py'
--- tests/test_tagfile.py 1970-01-01 00:00:00 +0000
+++ tests/test_tagfile.py 2012-01-20 14:52:28 +0000
@@ -0,0 +1,106 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2012 Canonical Ltd.
+# Author: Colin Watson <cjwatson@ubuntu.com>
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+"""Unit tests for verifying the correctness of apt_pkg.TagFile."""
+from __future__ import print_function, unicode_literals
+import io
+import os
+import shutil
+import sys
+import tempfile
+import unittest
+
+import apt_pkg
+
+
+class TestTagFile(unittest.TestCase):
+ """Test apt_pkg.TagFile."""
+
+ def setUp(self):
+ apt_pkg.init()
+ self.temp_dir = tempfile.mkdtemp()
+
+ def tearDown(self):
+ shutil.rmtree(self.temp_dir)
+
+ def test_utf8(self):
+ value = "Tést Persön <test@example.org>"
+ packages = os.path.join(self.temp_dir, "Packages")
+ with io.open(packages, "w", encoding="UTF-8") as packages_file:
+ print("Maintainer: %s" % value, file=packages_file)
+ print("", file=packages_file)
+ if sys.version < '3':
+ # In Python 2, test the traditional file interface.
+ with open(packages) as packages_file:
+ tagfile = apt_pkg.TagFile(packages_file)
+ tagfile.step()
+ self.assertEqual(
+ value.encode("UTF-8"), tagfile.section["Maintainer"])
+ with io.open(packages, encoding="UTF-8") as packages_file:
+ tagfile = apt_pkg.TagFile(packages_file)
+ tagfile.step()
+ if sys.version < '3':
+ self.assertEqual(
+ value.encode("UTF-8"), tagfile.section["Maintainer"])
+ else:
+ self.assertEqual(value, tagfile.section["Maintainer"])
+
+ def test_latin1(self):
+ value = "Tést Persön <test@example.org>"
+ packages = os.path.join(self.temp_dir, "Packages")
+ with io.open(packages, "w", encoding="ISO-8859-1") as packages_file:
+ print("Maintainer: %s" % value, file=packages_file)
+ print("", file=packages_file)
+ if sys.version < '3':
+ # In Python 2, test the traditional file interface.
+ with open(packages) as packages_file:
+ tagfile = apt_pkg.TagFile(packages_file)
+ tagfile.step()
+ self.assertEqual(
+ value.encode("ISO-8859-1"), tagfile.section["Maintainer"])
+ with io.open(packages) as packages_file:
+ tagfile = apt_pkg.TagFile(packages_file, bytes=True)
+ tagfile.step()
+ self.assertEqual(
+ value.encode("ISO-8859-1"), tagfile.section["Maintainer"])
+ if sys.version >= '3':
+ # In Python 3, TagFile can pick up the encoding of the file
+ # object.
+ with io.open(packages, encoding="ISO-8859-1") as packages_file:
+ tagfile = apt_pkg.TagFile(packages_file)
+ tagfile.step()
+ self.assertEqual(value, tagfile.section["Maintainer"])
+
+ def test_mixed(self):
+ value = "Tést Persön <test@example.org>"
+ packages = os.path.join(self.temp_dir, "Packages")
+ with io.open(packages, "w", encoding="UTF-8") as packages_file:
+ print("Maintainer: %s" % value, file=packages_file)
+ print("", file=packages_file)
+ with io.open(packages, "a", encoding="ISO-8859-1") as packages_file:
+ print("Maintainer: %s" % value, file=packages_file)
+ print("", file=packages_file)
+ if sys.version < '3':
+ # In Python 2, test the traditional file interface.
+ with open(packages) as packages_file:
+ tagfile = apt_pkg.TagFile(packages_file)
+ tagfile.step()
+ self.assertEqual(
+ value.encode("UTF-8"), tagfile.section["Maintainer"])
+ tagfile.step()
+ self.assertEqual(
+ value.encode("ISO-8859-1"), tagfile.section["Maintainer"])
+ with io.open(packages) as packages_file:
+ tagfile = apt_pkg.TagFile(packages_file, bytes=True)
+ tagfile.step()
+ self.assertEqual(
+ value.encode("UTF-8"), tagfile.section["Maintainer"])
+ tagfile.step()
+ self.assertEqual(
+ value.encode("ISO-8859-1"), tagfile.section["Maintainer"])
--
Colin Watson [cjwatson@ubuntu.com]
Reply to: