[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#656288: python3-apt: difficulties with non-UTF-8-encoded TagFiles



On Wed, Jan 18, 2012 at 12:14:02PM +0100, Julian Andres Klode wrote:
> You'd also need to take care of TagSection if that is done, which should
> then work in bytes mode when passed a bytes string.
> 
> Basically you'd need to modify TagSection and TagFile to both store whether
> to use bytes or unicode and pass the value of that flag from the TagFile
> to the TagSection. Then create a function
> 
> 	PyObject *TagFile_ToString(char *s, size_t n)
> 
> or similar that uses PyString_* functions or PyBytes_ functions depending
> on the context (where PyString is mapped to unicode in Python 3, and
> str in Python 2). Then use that function everywhere we currently
> create strings in the TagFile.

OK.  How about something like this?  I added both an explicit bytes=
parameter and a fallback which tries to detect the encoding from the
file object.

=== modified file 'python/tag.cc'
--- python/tag.cc	2011-11-10 16:20:58 +0000
+++ python/tag.cc	2012-01-20 14:47:56 +0000
@@ -38,6 +38,10 @@ using namespace std;
 struct TagSecData : public CppPyObject<pkgTagSection>
 {
    char *Data;
+   bool Bytes;
+#if PY_MAJOR_VERSION >= 3
+   PyObject *Encoding;
+#endif
 };
 
 // The owner of the TagFile is a Python file object.
@@ -45,6 +49,10 @@ struct TagFileData : public CppPyObject<
 {
    TagSecData *Section;
    FileFd Fd;
+   bool Bytes;
+#if PY_MAJOR_VERSION >= 3
+   PyObject *Encoding;
+#endif
 };
 
 // Traversal and Clean for owned objects
@@ -60,6 +68,35 @@ int TagFileClear(PyObject *self) {
     return 0;
 }
 
+// Helpers to return Unicode or bytes as appropriate.
+#if PY_MAJOR_VERSION < 3
+#define TagSecString_FromStringAndSize(self, v, len) \
+    PyString_FromStringAndSize((v), (len))
+#define TagSecString_FromString(self, v) PyString_FromString(v)
+#else
+PyObject *TagSecString_FromStringAndSize(PyObject *self, const char *v,
+	 				 Py_ssize_t len) {
+   TagSecData *Self = (TagSecData *)self;
+   if (Self->Bytes)
+      return PyBytes_FromStringAndSize(v, len);
+   else if (Self->Encoding)
+      return PyUnicode_Decode(v, len, PyUnicode_AsString(Self->Encoding), 0);
+   else
+      return PyUnicode_FromStringAndSize(v, len);
+}
+
+PyObject *TagSecString_FromString(PyObject *self, const char *v) {
+   TagSecData *Self = (TagSecData *)self;
+   if (Self->Bytes)
+      return PyBytes_FromString(v);
+   else if (Self->Encoding)
+      return PyUnicode_Decode(v, strlen(v),
+			      PyUnicode_AsString(Self->Encoding), 0);
+   else
+      return PyUnicode_FromString(v);
+}
+#endif
+
 
 									/*}}}*/
 // TagSecFree - Free a Tag Section					/*{{{*/
@@ -107,9 +144,9 @@ static PyObject *TagSecFind(PyObject *Se
    {
       if (Default == 0)
 	 Py_RETURN_NONE;
-      return PyString_FromString(Default);
+      return TagSecString_FromString(Self,Default);
    }
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
 }
 
 static char *doc_FindRaw =
@@ -128,14 +165,14 @@ static PyObject *TagSecFindRaw(PyObject
    {
       if (Default == 0)
 	 Py_RETURN_NONE;
-      return PyString_FromString(Default);
+      return TagSecString_FromString(Self,Default);
    }
 
    const char *Start;
    const char *Stop;
    GetCpp<pkgTagSection>(Self).Get(Start,Stop,Pos);
 
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
 }
 
 static char *doc_FindFlag =
@@ -161,21 +198,18 @@ static PyObject *TagSecFindFlag(PyObject
 // Map access, operator []
 static PyObject *TagSecMap(PyObject *Self,PyObject *Arg)
 {
-   if (PyString_Check(Arg) == 0)
-   {
-      PyErr_SetNone(PyExc_TypeError);
+   const char *Name = PyObject_AsString(Arg);
+   if (Name == 0)
       return 0;
-   }
-
    const char *Start;
    const char *Stop;
-   if (GetCpp<pkgTagSection>(Self).Find(PyString_AsString(Arg),Start,Stop) == false)
+   if (GetCpp<pkgTagSection>(Self).Find(Name,Start,Stop) == false)
    {
-      PyErr_SetString(PyExc_KeyError,PyString_AsString(Arg));
+      PyErr_SetString(PyExc_KeyError,Name);
       return 0;
    }
 
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
 }
 
 // len() operation
@@ -230,9 +264,9 @@ static PyObject *TagSecExists(PyObject *
 
 static int TagSecContains(PyObject *Self,PyObject *Arg)
 {
-   if (PyString_Check(Arg) == 0)
-       return 0;
-   const char *Name = PyString_AsString(Arg);
+   const char *Name = PyObject_AsString(Arg);
+   if (Name == 0)
+      return 0;
    const char *Start;
    const char *Stop;
    if (GetCpp<pkgTagSection>(Self).Find(Name,Start,Stop) == false)
@@ -256,7 +290,7 @@ static PyObject *TagSecStr(PyObject *Sel
    const char *Start;
    const char *Stop;
    GetCpp<pkgTagSection>(Self).GetSection(Start,Stop);
-   return PyString_FromStringAndSize(Start,Stop-Start);
+   return TagSecString_FromStringAndSize(Self,Start,Stop-Start);
 }
 									/*}}}*/
 // TagFile Wrappers							/*{{{*/
@@ -286,6 +320,12 @@ static PyObject *TagFileNext(PyObject *S
    Obj.Section->Owner = Self;
    Py_INCREF(Obj.Section->Owner);
    Obj.Section->Data = 0;
+   Obj.Section->Bytes = Obj.Bytes;
+#if PY_MAJOR_VERSION >= 3
+   // We don't need to incref Encoding as the previous Section object already
+   // held a reference to it.
+   Obj.Section->Encoding = Obj.Encoding;
+#endif
    if (Obj.Object.Step(Obj.Section->Object) == false)
       return HandleErrors(NULL);
 
@@ -347,11 +387,12 @@ static PyObject *TagFileJump(PyObject *S
 static PyObject *TagSecNew(PyTypeObject *type,PyObject *Args,PyObject *kwds) {
    char *Data;
    int Len;
-   char *kwlist[] = {"text", 0};
+   char Bytes = 0;
+   char *kwlist[] = {"text", "bytes", 0};
 
    // this allows reading "byte" types from python3 - but we don't
    // make (much) use of it yet
-   if (PyArg_ParseTupleAndKeywords(Args,kwds,"s#",kwlist,&Data,&Len) == 0)
+   if (PyArg_ParseTupleAndKeywords(Args,kwds,"s#|b",kwlist,&Data,&Len,&Bytes) == 0)
       return 0;
 
    // Create the object..
@@ -359,6 +400,10 @@ static PyObject *TagSecNew(PyTypeObject
    new (&New->Object) pkgTagSection();
    New->Data = new char[strlen(Data)+2];
    snprintf(New->Data,strlen(Data)+2,"%s\n",Data);
+   New->Bytes = Bytes;
+#if PY_MAJOR_VERSION >= 3
+   New->Encoding = 0;
+#endif
 
    if (New->Object.Scan(New->Data,strlen(New->Data)) == false)
    {
@@ -390,9 +435,10 @@ PyObject *ParseSection(PyObject *self,Py
 
 static PyObject *TagFileNew(PyTypeObject *type,PyObject *Args,PyObject *kwds)
 {
-   PyObject *File;
-   char *kwlist[] = {"file", 0};
-   if (PyArg_ParseTupleAndKeywords(Args,kwds,"O",kwlist,&File) == 0)
+   PyObject *File = 0;
+   char Bytes = 0;
+   char *kwlist[] = {"file", "bytes", 0};
+   if (PyArg_ParseTupleAndKeywords(Args,kwds,"O|b",kwlist,&File,&Bytes) == 0)
       return 0;
    int fileno = PyObject_AsFileDescriptor(File);
    if (fileno == -1)
@@ -405,8 +451,15 @@ static PyObject *TagFileNew(PyTypeObject
 #else
    new (&New->Fd) FileFd(fileno,false);
 #endif
+   New->Bytes = Bytes;
    New->Owner = File;
    Py_INCREF(New->Owner);
+#if PY_MAJOR_VERSION >= 3
+   New->Encoding = PyObject_GetAttr(File, PyUnicode_FromString("encoding"));
+   if (!PyUnicode_Check(New->Encoding))
+      New->Encoding = 0;
+   Py_XINCREF(New->Encoding);
+#endif
    new (&New->Object) pkgTagFile(&New->Fd);
 
    // Create the section
@@ -415,6 +468,11 @@ static PyObject *TagFileNew(PyTypeObject
    New->Section->Owner = New;
    Py_INCREF(New->Section->Owner);
    New->Section->Data = 0;
+   New->Section->Bytes = Bytes;
+#if PY_MAJOR_VERSION >= 3
+   New->Section->Encoding = New->Encoding;
+   Py_XINCREF(New->Section->Encoding);
+#endif
 
    return HandleErrors(New);
 }
@@ -492,7 +550,7 @@ PyObject *RewriteSection(PyObject *self,
    }
 
    // Return the string
-   PyObject *ResObj = PyString_FromStringAndSize(bp,size);
+   PyObject *ResObj = TagSecString_FromStringAndSize(Section,bp,size);
    free(bp);
    return HandleErrors(ResObj);
 }
@@ -521,11 +579,15 @@ PySequenceMethods TagSecSeqMeth = {0,0,0
 PyMappingMethods TagSecMapMeth = {TagSecLength,TagSecMap,0};
 
 
-static char *doc_TagSec = "TagSection(text: str)\n\n"
+static char *doc_TagSec = "TagSection(text: str, [bytes: bool = False])\n\n"
    "Provide methods to access RFC822-style header sections, like those\n"
    "found in debian/control or Packages files.\n\n"
    "TagSection() behave like read-only dictionaries and also provide access\n"
-   "to the functions provided by the C++ class (e.g. find)";
+   "to the functions provided by the C++ class (e.g. find).\n\n"
+   "By default, text read from files is treated as strings (binary data in\n"
+   "Python 2, Unicode strings in Python 3). Use bytes=True to cause all\n"
+   "header values read from this TagSection to be bytes even in Python 3.\n"
+   "Header names are always treated as Unicode.";
 PyTypeObject PyTagSection_Type =
 {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
@@ -596,7 +658,7 @@ static PyGetSetDef TagFileGetSet[] = {
 };
 
 
-static char *doc_TagFile = "TagFile(file)\n\n"
+static char *doc_TagFile = "TagFile(file, [bytes: bool = False])\n\n"
    "TagFile() objects provide access to debian control files, which consist\n"
    "of multiple RFC822-style sections.\n\n"
    "To provide access to those sections, TagFile objects provide an iterator\n"
@@ -608,7 +670,11 @@ static char *doc_TagFile = "TagFile(file
    "It is important to not mix the use of both APIs, because this can have\n"
    "unwanted effects.\n\n"
    "The parameter 'file' refers to an object providing a fileno() method or\n"
-   "a file descriptor (an integer)";
+   "a file descriptor (an integer).\n\n"
+   "By default, text read from files is treated as strings (binary data in\n"
+   "Python 2, Unicode strings in Python 3). Use bytes=True to cause all\n"
+   "header values read from this TagFile to be bytes even in Python 3.\n"
+   "Header names are always treated as Unicode.";
 
 // Type for a Tag File
 PyTypeObject PyTagFile_Type =

=== added file 'tests/test_tagfile.py'
--- tests/test_tagfile.py	1970-01-01 00:00:00 +0000
+++ tests/test_tagfile.py	2012-01-20 14:52:28 +0000
@@ -0,0 +1,106 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2012 Canonical Ltd.
+# Author: Colin Watson <cjwatson@ubuntu.com>
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved.
+"""Unit tests for verifying the correctness of apt_pkg.TagFile."""
+from __future__ import print_function, unicode_literals
+import io
+import os
+import shutil
+import sys
+import tempfile
+import unittest
+
+import apt_pkg
+
+
+class TestTagFile(unittest.TestCase):
+    """Test apt_pkg.TagFile."""
+
+    def setUp(self):
+        apt_pkg.init()
+        self.temp_dir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.temp_dir)
+
+    def test_utf8(self):
+        value = "Tést Persön <test@example.org>"
+        packages = os.path.join(self.temp_dir, "Packages")
+        with io.open(packages, "w", encoding="UTF-8") as packages_file:
+            print("Maintainer: %s" % value, file=packages_file)
+            print("", file=packages_file)
+        if sys.version < '3':
+            # In Python 2, test the traditional file interface.
+            with open(packages) as packages_file:
+                tagfile = apt_pkg.TagFile(packages_file)
+                tagfile.step()
+                self.assertEqual(
+                    value.encode("UTF-8"), tagfile.section["Maintainer"])
+        with io.open(packages, encoding="UTF-8") as packages_file:
+            tagfile = apt_pkg.TagFile(packages_file)
+            tagfile.step()
+            if sys.version < '3':
+                self.assertEqual(
+                    value.encode("UTF-8"), tagfile.section["Maintainer"])
+            else:
+                self.assertEqual(value, tagfile.section["Maintainer"])
+
+    def test_latin1(self):
+        value = "Tést Persön <test@example.org>"
+        packages = os.path.join(self.temp_dir, "Packages")
+        with io.open(packages, "w", encoding="ISO-8859-1") as packages_file:
+            print("Maintainer: %s" % value, file=packages_file)
+            print("", file=packages_file)
+        if sys.version < '3':
+            # In Python 2, test the traditional file interface.
+            with open(packages) as packages_file:
+                tagfile = apt_pkg.TagFile(packages_file)
+                tagfile.step()
+                self.assertEqual(
+                    value.encode("ISO-8859-1"), tagfile.section["Maintainer"])
+        with io.open(packages) as packages_file:
+            tagfile = apt_pkg.TagFile(packages_file, bytes=True)
+            tagfile.step()
+            self.assertEqual(
+                value.encode("ISO-8859-1"), tagfile.section["Maintainer"])
+        if sys.version >= '3':
+            # In Python 3, TagFile can pick up the encoding of the file
+            # object.
+            with io.open(packages, encoding="ISO-8859-1") as packages_file:
+                tagfile = apt_pkg.TagFile(packages_file)
+                tagfile.step()
+                self.assertEqual(value, tagfile.section["Maintainer"])
+
+    def test_mixed(self):
+        value = "Tést Persön <test@example.org>"
+        packages = os.path.join(self.temp_dir, "Packages")
+        with io.open(packages, "w", encoding="UTF-8") as packages_file:
+            print("Maintainer: %s" % value, file=packages_file)
+            print("", file=packages_file)
+        with io.open(packages, "a", encoding="ISO-8859-1") as packages_file:
+            print("Maintainer: %s" % value, file=packages_file)
+            print("", file=packages_file)
+        if sys.version < '3':
+            # In Python 2, test the traditional file interface.
+            with open(packages) as packages_file:
+                tagfile = apt_pkg.TagFile(packages_file)
+                tagfile.step()
+                self.assertEqual(
+                    value.encode("UTF-8"), tagfile.section["Maintainer"])
+                tagfile.step()
+                self.assertEqual(
+                    value.encode("ISO-8859-1"), tagfile.section["Maintainer"])
+        with io.open(packages) as packages_file:
+            tagfile = apt_pkg.TagFile(packages_file, bytes=True)
+            tagfile.step()
+            self.assertEqual(
+                value.encode("UTF-8"), tagfile.section["Maintainer"])
+            tagfile.step()
+            self.assertEqual(
+                value.encode("ISO-8859-1"), tagfile.section["Maintainer"])

-- 
Colin Watson                                       [cjwatson@ubuntu.com]



Reply to: