Re: Script to filter debdiff output
Hi,
Frank Küster <frank@debian.org> wrote:
> I dare ask one more thing:
>
> Could you add a feature for moved conffiles?
Done in the attached script. I'll document it later, sorry.
Basically:
If a file is found somewhere so that the full path matches
one of the regexes in config_dirs_regexes (defined at the top, currently
only containing "^/etc/texmf"), and this file is also present in the
other .deb:
get, for each .deb, the list of directories where the file appears
if the sets of these directories differ between first deb and second deb:
remove all entries for this file from the debdiff output
display these entries in a separate section at the end of the
output, named "Moved configuration files [computed by
tex-filter-debdiff.py]"; this display is organized, first by file
basename, and second, by .deb, listing for each .deb the directories
where the file can be found, to ease comparison.
Smoke tests run fine, proper documentation and maybe small refactoring will
come later. :)
#! /usr/bin/env python
# tex-filter-debdiff.py --- Filter debdiff output
# Copyright (c) 2007 Florent Rougon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; see the file COPYING. If not, write to the
# Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
# Boston, MA 02110-1301 USA.
import sys, os, re
# The filtering will only be done for files whose path (as reported in the
# debdiff output) matches one of the following regular expressions.
#
# Note: "^/usr/share/doc/texlive" matches /usr/share/doc/texlive-lang-french
# and many others...
filter_in_regexes = [r"^/usr/share/texmf-texlive",
                     r"^/usr/share/doc/texlive"]

# Files whose full path matches one of these regular expressions are
# candidates for the "moved configuration files" treatment.
config_dirs_regexes = [r"^/etc/texmf"]

# Compiled counterparts of the two lists above. They are wrapped in list()
# because they are scanned once per file entry: a bare map() is a one-shot
# iterator on Python 3 and would be exhausted after the first entry.
filter_in = list(map(re.compile, filter_in_regexes))
config_dirs = list(map(re.compile, config_dirs_regexes))
# Name under which the script was invoked; it is shown in the title of the
# extra section generated for moved configuration files.
progname = os.path.basename(sys.argv[0])

# Titles of the two debdiff sections that get filtered (the "interesting"
# ones). Every other section is reproduced verbatim.
first_deb_sec_rec = re.compile(r"^Files in first \.deb but not in second$")
second_deb_sec_rec = re.compile(r"^Files in second \.deb but not in first$")
class error(Exception):
    """Base class of all the exceptions defined by this program."""


class ParseError(error):
    """Exception raised when the debdiff output cannot be parsed."""


class ProgramError(error):
    """Exception raised for obvious bugs (when an assertion is false)."""
def split_input_into_sections(f):
    """Split the debdiff output read from 'f' into sections.

    Sections are separated by an empty line. A section may start with a
    title, i.e. a first line that is underlined by a second line made only
    of dashes.

    'f' must provide a readline() method returning '' at end of input.

    Return a list of dictionaries, one per section, with the keys:
      "name"             section title without its newline, or None for an
                         untitled section
      "lines"            the lines of the section (title and underline
                         excluded), without their trailing newline
      "title delimiter"  the underline, only present for titled sections

    Raise ParseError if an underline is found without a preceding title.
    """
    sections = []
    # How sections are underlined in the input
    sec_delim = re.compile(r"^-+$")
    section_number = 1

    while True:
        section = {"name": None,
                   "lines": []}
        # Will store the previous line (needed to remember section titles)
        prev_line = None
        # Line number within a section
        line_num = 1

        while True:
            line = f.readline()
            # '' means end of input, '\n' is the blank line between sections
            if line in ('', '\n'):
                break

            # Only the line terminator is dropped: the last line of the
            # input may lack one, and must not lose its final character.
            stripped = line.rstrip('\n')

            mo = sec_delim.match(line)
            # Section delimiters are only considered as such iff found on
            # the second line of a section.
            if mo and (line_num == 2):
                if prev_line is None:
                    raise ParseError(
                        "Section %u, line %u (within the section): section "
                        "delimiter not preceded by a section title"
                        % (section_number, line_num))
                # The section title will be stored in section["name"];
                # therefore, remove it from section["lines"], i.e., start
                # over since we are on the second line.
                section["lines"] = []
                section["name"] = prev_line
                # Store the section delimiter, in order to reproduce the
                # debdiff output verbatim.
                section["title delimiter"] = mo.group(0)
            else:
                section["lines"].append(stripped)

            # Kept without its newline, ready to be used as a title
            prev_line = stripped
            line_num += 1

        sections.append(section)
        if line == '':          # EOF
            break
        section_number += 1

    return sections
def locate_interesting_sections(sections):
    """Find the two sections that are subject to filtering.

    Return a (first_deb_section, second_deb_section) pair. Each member is
    the section whose title matches first_deb_sec_rec (respectively
    second_deb_sec_rec), or None when there is no such section. If several
    sections carry the same title, the last one is returned.
    """
    found_first = None
    found_second = None

    for candidate in sections:
        title = candidate["name"]
        if title is None:
            # Untitled sections are never "interesting"
            continue
        if first_deb_sec_rec.match(title):
            found_first = candidate
        elif second_deb_sec_rec.match(title):
            found_second = candidate

    return found_first, found_second
def index_files(section):
    """Build a dictionary whose keys are the file basenames.

    This allows to easily find everything pertaining to a file given its
    basename.

    Every line of section["lines"] is parsed into a "file entry" (a
    dictionary with the keys "name", "type", "dirname", "mode", "owner and
    group", the two "separator after ..." keys and, for symlinks, "target"
    and "symlink arrow"). Each entry is stored twice in 'section':
      section["files"][basename]  list of the entries sharing this basename
      section["ordered list"]     all the entries, in input order

    Both containers are created if missing, and extended otherwise.

    Raise ParseError for a line that does not look like a file entry.
    """
    # 'files_by_name' maps a basename to the list of its entries.
    files_by_name = section.setdefault("files", {})
    # The file entries for every file in section["lines"], in the same order
    # as they appear there. This is what allows dump functions to preserve
    # the order of files in the debdiff output.
    ordered_entries = section.setdefault("ordered list", [])

    # The point of having the "sep_after_mode", "sep_after_owner_and_group"
    # and "symlink_arrow" groups is to recreate exactly the debdiff output
    # for each entry (in particular, use the same number of spaces/tabs
    # that were used by debdiff between the various fields).
    nonsymlink_line_rec = re.compile(
        r"^(?P<mode>[^l][^ \t]+)(?P<sep_after_mode>[ \t]+)"
        r"(?P<owner_and_group>[^ \t]+)(?P<sep_after_owner_and_group>[ \t]+)"
        r"(?P<path>.+?)$")
    symlink_line_rec = re.compile(
        r"^(?P<mode>l[^ \t]+)(?P<sep_after_mode>[ \t]+)"
        r"(?P<owner_and_group>[^ \t]+)(?P<sep_after_owner_and_group>[ \t]+)"
        r"(?P<link>.+?)(?P<symlink_arrow> -> )(?P<target>.+)$")

    for line in section["lines"]:
        # The mode field of a symlink starts with 'l'
        if line.startswith("l"):
            mo = symlink_line_rec.match(line)
            if not mo:
                raise ParseError(
                    "Looks like a line for a symlink, but doesn't match the "
                    "corresponding regexp:\n\n  '%s'" % line)
            path = mo.group("link")
            entry = {"type": "symlink",
                     "target": mo.group("target"),
                     "symlink arrow": mo.group("symlink_arrow")}
        else:
            mo = nonsymlink_line_rec.match(line)
            if not mo:
                raise ParseError(
                    "Looks like a line for a file that is not a symlink, but "
                    "doesn't match the corresponding regexp:\n\n  '%s'" % line)
            path = mo.group("path")
            entry = {"type": "not a symlink"}

        # Fields shared by both kinds of entries
        name = os.path.basename(path)
        entry.update({
            "name": name,
            "dirname": os.path.dirname(path),
            "mode": mo.group("mode"),
            "owner and group": mo.group("owner_and_group"),
            "separator after mode": mo.group("sep_after_mode"),
            "separator after owner and group":
                mo.group("sep_after_owner_and_group")})

        # Record all this precious data...
        # ... first, under the file basename (the list is created the first
        # time a file with this basename is found):
        files_by_name.setdefault(name, []).append(entry)
        # and second, in the ordered list, which references the very same
        # entry object and preserves the order of debdiff's output:
        ordered_entries.append(entry)
def write_section_title(output, section):
    """Write the title of 'section' and its underline to 'output'.

    Nothing is written for an untitled section (section["name"] is None).
    """
    title = section["name"]
    if title is None:
        return
    underline = section["title delimiter"]
    output.write("%s\n%s\n" % (title, underline))
def dump_unfiltered_section(output, section):
    """Reproduce 'section' on 'output' exactly as it was read.

    The title and its underline come first (if the section has a title),
    followed by every line of the section.
    """
    write_section_title(output, section)

    for text in section["lines"]:
        output.write("%s\n" % text)
def dirnames_are_equivalent(dirname1, dirname2):
    """Tell whether two directory names count as "the same place".

    This is the equivalence used when filtering a section: two dirnames are
    equivalent if, and only if, they have the same parent directory, i.e.
    they are equal or only differ in the last component.
    """
    parent1 = os.path.dirname(dirname1)
    parent2 = os.path.dirname(dirname2)
    return parent1 == parent2
def file_entry(entry):
    """Rebuild the debdiff output line corresponding to a file entry.

    'entry' is a dictionary with the keys "mode", "separator after mode",
    "owner and group", "separator after owner and group", "dirname", "name"
    and "type"; symlink entries also need "symlink arrow" and "target".
    Since the separators are stored in the entry, the line uses the same
    spacing between the fields as the entry does.

    Return the line, without a trailing newline.

    Raise ProgramError if entry["type"] is neither "symlink" nor
    "not a symlink".
    """
    full_path = os.path.join(entry["dirname"], entry["name"])

    if entry["type"] == "symlink":
        trailing_components = ("symlink arrow", "target")
    elif entry["type"] == "not a symlink":
        trailing_components = ()
    else:
        # No section is known at this point, hence none is mentioned in the
        # message.
        raise ProgramError(
            "Unexpected entry type '%s' for '%s'." %
            (entry["type"], full_path))

    # Build the line for this file entry
    line = [entry[component]
            for component in ("mode", "separator after mode",
                              "owner and group",
                              "separator after owner and group")]
    line.append(full_path)
    line.extend([entry[component] for component in trailing_components])

    return ''.join(line)
def build_dir_list_for_file(name, section):
    """List the directories of 'section' holding a file with basename 'name'.

    'section' must have a "files" dictionary mapping basenames to lists of
    file entries. The result is a new list with one "dirname" per entry, in
    the order of the entries; it is empty when 'name' is unknown in
    'section'.
    """
    return [entry["dirname"] for entry in section["files"].get(name, [])]
def _matches_any(regexes, path):
    """Tell whether 'path' is matched by at least one of the compiled 'regexes'."""
    for regexp in regexes:
        if regexp.match(path):
            return True
    return False


def process_filtered_section(output, sec, other_sec, filter_in=None,
                             config_dirs=None):
    """Dump section 'sec' in a filtered way, depending on 'other_sec'.

    Both sections must already have their "files" and "ordered list" keys
    (index_files() creates them).

    The title of 'sec' is written first. Then, for every file entry of
    'sec', in order:

      A. Moved configuration files. If:
           (a) 'config_dirs' is not None and the full path of the entry
               (dirname + basename) matches at least one of its compiled
               regular expressions, and
           (b) 'other_sec' has at least one entry with the same basename,
               and
           (c) the sorted list of directories where this basename appears
               differs between 'sec' and 'other_sec'
         then all the entries with this basename are removed from the
         "ordered list" of both sections and recorded in the return value;
         none of them is printed by this call.

      B. Otherwise, if:
           (1) either 'filter_in' is None, or the full path of the entry
               matches at least one of the compiled regular expressions in
               'filter_in', and
           (2) there is a corresponding entry in 'other_sec' with the same
               file basename, and
           (3) the dirnames of these two entries are considered equivalent
               by dirnames_are_equivalent() and
           (4) both entries have the same mode, owner and group
         then:
           do nothing
         else:
           print the file entry unmodified.

    Return a dictionary mapping the basename of every moved configuration
    file to a pair ((sec["name"], entries in 'sec'),
                    (other_sec["name"], entries in 'other_sec')).

    Notes:

      (a) The condition (1), when 'filter_in' is not None, ensures that we
          don't filter out changes that ought to be noticed (e.g., for TeX
          Live, we typically want to filter out those files which fulfill
          the other conditions only if they appear under
          /usr/share/texmf-texlive/ or /usr/share/doc/texlive*, but not
          under /usr/lib/!).

      (b) 'sec' and 'other_sec' typically correspond to those parts of
          debdiff's output labeled "Files in first .deb but not in second"
          and "Files in second .deb but not in first".

      (c) Since step A also empties the "ordered list" of 'other_sec', a
          later call with the two sections swapped will neither print nor
          report these files again.
    """
    # The patterns are scanned once per entry; make sure that they can be
    # iterated over several times during this call.
    if filter_in is not None:
        filter_in = list(filter_in)
    if config_dirs is not None:
        config_dirs = list(config_dirs)

    moved_config_files = {}

    # Section title, if any
    write_section_title(output, sec)

    # Section contents. We iterate over a copy because step A removes
    # entries from sec["ordered list"].
    for entry in list(sec["ordered list"]):
        name, dirname = (entry["name"], entry["dirname"])

        if name in moved_config_files:
            # This entry was removed from the section when an earlier entry
            # with the same basename was found to be a moved config file.
            continue

        full_path = os.path.join(dirname, name)

        # Handling of files under config_dirs that were moved. If the file
        # cannot be found in the other section, perform no special
        # treatment: list it as debdiff would.
        if (config_dirs is not None
                and name in other_sec["files"]
                and _matches_any(config_dirs, full_path)):
            dirs_in_this_section = sorted(build_dir_list_for_file(name, sec))
            dirs_in_other_section = sorted(
                build_dir_list_for_file(name, other_sec))

            if dirs_in_this_section != dirs_in_other_section:
                # There will be some redundancy in this data structure,
                # with the section names.
                moved_config_files[name] = ((sec["name"],
                                             sec["files"][name]),
                                            (other_sec["name"],
                                             other_sec["files"][name]))
                # Remove the corresponding entries from
                # sec["ordered list"] and other_sec["ordered list"].
                for section in (sec, other_sec):
                    for moved_entry in section["files"][name]:
                        section["ordered list"].remove(moved_entry)
                # The entry will be displayed with the other moved
                # configuration files, not here.
                continue

        passes_through_regexp_filter = (
            filter_in is None or _matches_any(filter_in, full_path))

        filtered_out = False
        if passes_through_regexp_filter:
            for other in other_sec["files"].get(name, []):
                # Note: if both entries ('entry' and 'other') have the
                # same mode, then they are necessarily of the same type
                # (symlink / not symlink). Therefore, it is useless to
                # compare the types, since we already compare the modes.
                if dirnames_are_equivalent(dirname, other["dirname"]) \
                        and (entry["mode"] == other["mode"]) \
                        and (entry["owner and group"]
                             == other["owner and group"]):
                    filtered_out = True
                    break

        if not filtered_out:
            output.write(file_entry(entry) + '\n')

    return moved_config_files
def short_sec_name(section_name):
    """Return the short label standing for one of the two filtered sections.

    The result is "First deb" or "Second deb", depending on which of
    first_deb_sec_rec and second_deb_sec_rec matches 'section_name'.

    Raise ProgramError if 'section_name' matches neither.
    """
    if first_deb_sec_rec.match(section_name):
        return "First deb"
    if second_deb_sec_rec.match(section_name):
        return "Second deb"
    raise ProgramError("Unexpected section name here...")
def flush_chunk(chunks, lines):
    """Move the pending 'lines' into 'chunks' as one newline-joined string.

    The string appended to 'chunks' ends with a newline as soon as 'lines'
    is not empty. 'lines' is emptied in place, so that the caller can keep
    filling the very same list.
    """
    # The extra empty string is what yields the final newline
    chunks.append('\n'.join(lines + ['']))
    lines[:] = []
def append_indented_line(string, lines, indentation):
    """Append 'string' to 'lines', preceded by 'indentation' spaces."""
    padding = ' ' * indentation
    lines.append(padding + string)
def main():
    """Filter the debdiff output read on stdin and write it to stdout.

    The two sections "Files in first .deb but not in second" and "Files in
    second .deb but not in first" are filtered with
    process_filtered_section() when both are present; every other section
    is dumped verbatim. If moved configuration files were detected, they
    are listed in an additional section at the end of the output.

    Exit with status 0.
    """
    sections = split_input_into_sections(sys.stdin)
    # Locate the sections "Files in second .deb but not in first"
    # and "Files in first .deb but not in second"
    first_deb_sec, second_deb_sec = locate_interesting_sections(sections)

    # It is only useful to index the "interesting" sections if both of them
    # are present (otherwise, we'll just dump them verbatim).
    filtering = (first_deb_sec is not None) and (second_deb_sec is not None)
    if filtering:
        for section in (first_deb_sec, second_deb_sec):
            index_files(section)

    # These stay empty when no filtering takes place
    moved_cfg_files_first = {}
    moved_cfg_files_second = {}

    # No section separator (newline) should be printed before the first
    # section
    print_section_separator = False
    output = sys.stdout

    for section in sections:
        if print_section_separator:
            output.write('\n')
        else:
            print_section_separator = True

        # Only the sections that were located (and indexed) above are
        # filtered; anything else, including an "interesting" section whose
        # counterpart is missing, is reproduced as is.
        if filtering and section is first_deb_sec:
            moved_cfg_files_first = \
                process_filtered_section(output, first_deb_sec,
                                         second_deb_sec,
                                         filter_in, config_dirs)
        elif filtering and section is second_deb_sec:
            moved_cfg_files_second = \
                process_filtered_section(output, second_deb_sec,
                                         first_deb_sec,
                                         filter_in, config_dirs)
        else:
            dump_unfiltered_section(output, section)

    if moved_cfg_files_first and moved_cfg_files_second:
        raise ProgramError('The first processed filtered section should '
                           'have "grabbed" all moved config files')

    # Pick the one between moved_cfg_files_first and moved_cfg_files_second
    # that is not an empty dictionary, if any.
    moved_cfg_files = moved_cfg_files_first or moved_cfg_files_second

    if moved_cfg_files:
        # Chunks of text, separated by a newline. The initial empty chunk
        # yields the blank line between the previous section and this one.
        chunks = ['']
        lines = []

        title = "Moved configuration files [computed by %s]" % progname
        lines.append(title)
        lines.append('-' * len(title))

        # Sorted by file basename, so that the output is reproducible
        for name, data in sorted(moved_cfg_files.items()):
            flush_chunk(chunks, lines)
            lines.append(name)

            # Iterate over the two sections; they are separated by an empty
            # line, but the first one directly follows the file name.
            is_first_section = True
            for sec_name, entries in data:
                if not is_first_section:
                    flush_chunk(chunks, lines)
                is_first_section = False

                append_indented_line(short_sec_name(sec_name), lines, 2)
                for entry in entries:
                    append_indented_line(file_entry(entry), lines, 4)

        flush_chunk(chunks, lines)
        output.write('\n'.join(chunks))

    sys.exit(0)


if __name__ == "__main__":
    main()
--
Florent
Reply to: