[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Re: [PATCH] Make use of a separate script for generation of debian.{bib,tex} files.



Hi,

I have created a patch that uses a separate script file for the generation of the debian.bib and debian.tex files. However, I have a couple of questions before I submit it. Please find my questions inline.

On Wed, Apr 22, 2015 at 7:07 PM, Akshita Jha <zenith158@gmail.com> wrote:
---
 udd/bibref_gatherer.py             | 110 +-----------------
 udd/blends_prospective_gatherer.py |   4 +
 udd/generate_bib_tex.py            | 226 +++++++++++++++++++++++++++++++++++++
 3 files changed, 235 insertions(+), 105 deletions(-)
 create mode 100644 udd/generate_bib_tex.py

diff --git a/udd/bibref_gatherer.py b/udd/bibref_gatherer.py
index 654d7e7..7113e04 100644
--- a/udd/bibref_gatherer.py
+++ b/udd/bibref_gatherer.py
@@ -6,8 +6,7 @@ This script imports bibliographic references from upstream-metadata.debian.net.

 from gatherer import gatherer
 from sys import stderr, exit
-from os import listdir, unlink, rename, access, X_OK
-from os.path import isfile
+from os import listdir
 from fnmatch import fnmatch
 import yaml
 from psycopg2 import IntegrityError, InternalError
@@ -18,48 +17,16 @@ from subprocess import Popen, PIPE

 from types import *

+from generate_bib_tex import generate_bib_tex
+

'generate_bib_tex' is the new file that is created. Is the name of the file all right? Does it follow the naming convention used by Debian?

 debug=0

 def get_gatherer(connection, config, source):
   return bibref_gatherer(connection, config, source)

-def rm_f(file):
-  try:
-    unlink(file)
-  except OSError:
-    pass
-
-def cleanup_tex_logs(basetexfile):
-  rm_f(basetexfile+'.aux')
-  rm_f(basetexfile+'.bbl')
-  rm_f(basetexfile+'.blg')
-  rm_f(basetexfile+'.log')
-
 # seek for authors separated by ',' rather than by ' and '
 seek_broken_authors_re = re.compile('^[^\s^,]+\s+[^\s^,]+\s*,\s*[^\s^,]+\s+[^\s^,]')

-def open_tex_process(texexe, basetexfile):
-  if texexe == 'pdflatex':
-    ptex = Popen(['pdflatex', '-interaction=batchmode', basetexfile], shell=False, stdout=PIPE)
-  elif texexe == 'bibtex':
-    ptex = Popen(['bibtex', basetexfile], shell=False, stdout=PIPE)
-  else:
-    return(False, 'Wrong exe: '+texexe)
-  errstring=""
-  if ptex.wait():
-    if texexe == 'pdflatex':
-      for logrow in ptex.communicate()[0].splitlines():
-        if logrow.startswith('!'):
-          errstring += logrow
-      return(False, errstring)
-    else:
-      for logrow in ptex.communicate()[0].splitlines():
-        if logrow.startswith('This is BibTeX'):
-          continue
-        errstring += logrow + '\n'
-      return(True, errstring)
-  return(True, errstring)
-
 other_known_keys = ('Archive',
                     'Bug-Database',
                     'Cite-As',
@@ -364,75 +331,8 @@ class bibref_gatherer(gatherer):
     # commit before check to make sure the table is not locked in case LaTeX run will fail for whatever reason
     self.connection.commit()

-    # if there is a working LaTeX installation try to build a BibTeX database and test it by creating a debian.pdf file
-    if isfile('/usr/bin/pdflatex') and access('/usr/bin/pdflatex', X_OK) and \
-       isfile('/usr/bin/bibtex')   and access('/usr/bin/bibtex', X_OK) and \
-       ( isfile('/usr/share/texlive/texmf-dist/fonts/source/jknappen/ec/ecrm.mf') or \
-         isfile('/usr/share/texmf-texlive/fonts/source/jknappen/ec/ecrm.mf') ) :
-      # create BibTeX file
-      bf = open(self.bibtexfile, 'w')
-      cur.execute("SELECT * FROM bibtex()")
-      for row in cur.fetchall():
-       print >>bf, row[0]
-      bf.close()
-
-      # create LaTeX file to test BibTeX functionality
-      bf = open(self.bibtex_example_tex, 'w')
-      print >>bf, """\documentclass[10]{article}
-\usepackage[T1]{fontenc}
-\usepackage[utf8]{inputenc}
-\usepackage[left=2mm,top=2mm,right=2mm,bottom=2mm,nohead,nofoot]{geometry}
-\usepackage{longtable}
-\usepackage[super]{natbib}
-\setlongtables
-\\begin{document}
-\small
-\\begin{longtable}{llp{70mm}l}
-\\bf package & \\bf source & \\bf description & BibTeX key \\\\ \hline"""
-
-      cur.execute("SELECT * FROM bibtex_example_data() AS (package text, source text, bibkey text, description text)")
-      for row in cur.fetchall():
-       print >>bf, row[0], '&', row[1], '&', row[3] , '&', row[2]+'\cite{'+row[2]+'} \\\\'
-
-      print >>bf, """\end{longtable}
-
-% \\bibliographystyle{plain}
-% Try a bit harder by also including URL+DOI
-\\bibliographystyle{plainnat}
-\\bibliography{debian}
-
-\end{document}
-"""
-      bf.close()
-
-      # try to build debian.pdf file to test aboc LaTeX file
-      basetexfile = self.bibtex_example_tex.replace('.tex','')
-      cleanup_tex_logs(basetexfile)
-      try:
-        rename(basetexfile+'.pdf', basetexfile+'.pdf~')
-      except OSError:
-        pass
-
-      (retcode,errstring) = open_tex_process('pdflatex', basetexfile)
-      if not retcode:
-        self.log.error("Problem in 1. PdfLaTeX run of %s.tex: `%s` --> please inspect %s.log" % (basetexfile, errstring, basetexfile))
-        exit(1)
-      (retcode,errstring) = open_tex_process('bibtex', basetexfile)
-      if errstring != "":
-        if not retcode:
-          self.log.error("Problem in BibTeX run of %s.bib: `%s`" % (basetexfile, errstring))
-          exit(1)
-        self.log.error("Ignore the following problems in BibTeX run of %s.bib: `%s`" % (basetexfile, errstring))
-      (retcode,errstring) = open_tex_process('pdflatex', basetexfile)
-      if not retcode:
-        self.log.error("Problem in 2. PdfLaTeX run of %s.tex: `%s` --> please inspect %s.log" % (basetexfile, errstring, basetexfile))
-        exit(1)
-      (retcode,errstring) = open_tex_process('pdflatex', basetexfile)
-      if not retcode:
-        self.log.error("Problem in 3. PdfLaTeX run of %s.tex: `%s` --> please inspect %s.log" % (basetexfile, errstring, basetexfile))
-        exit(1)
-
-      cleanup_tex_logs(basetexfile)
 
+    g = generate_bib_tex()
+    g.run(cur)


Do I need to generate the debian.{bib,tex} files in bibref_gatherer? For now I have called generate_bib_tex() there, but the files generated at that point will always contain outdated references.
 
 if __name__ == '__main__':
   main()
 
diff --git a/udd/blends_prospective_gatherer.py b/udd/blends_prospective_gatherer.py
index a130bb1..32a6505 100644
--- a/udd/blends_prospective_gatherer.py
+++ b/udd/blends_prospective_gatherer.py
@@ -19,6 +19,7 @@ from debian import deb822
 import email.Utils

 from bibref_gatherer import upstream_reader
+from generate_bib_tex import generate_bib_tex

 debug=0

@@ -434,6 +435,9 @@ class blends_prospective_gatherer(gatherer):
     cur.execute("DEALLOCATE bibref_insert")

     cur.execute("ANALYZE %s" % my_config['table'])
+
+    g = generate_bib_tex()
+    g.run(cur)


I think calling generate_bib_tex() in blends_prospective_gatherer, after the references from VCS have been inserted into the bibref table, is the expected solution to the issue at hand. Am I right?
 
 if __name__ == '__main__':
   main()
 
Below is generate_bib_tex.py file:

diff --git a/udd/generate_bib_tex.py b/udd/generate_bib_tex.py
new file mode 100644
index 0000000..d93b898
--- /dev/null
+++ b/udd/generate_bib_tex.py
@@ -0,0 +1,226 @@
+from os import unlink, rename, access, X_OK
+from os.path import isfile
+from subprocess import Popen, PIPE
+import logging
+import logging.handlers
+
+debug = 0
+
+def rm_f(file):
+  try:
+    unlink(file)
+  except OSError:
+    pass
+
+
+def cleanup_tex_logs(basetexfile):
+  rm_f(basetexfile+'.aux')
+  rm_f(basetexfile+'.bbl')
+  rm_f(basetexfile+'.blg')
+  rm_f(basetexfile+'.log')
+
+
+def open_tex_process(texexe, basetexfile):
+  if texexe == 'pdflatex':
+    ptex = Popen(['pdflatex', '-interaction=batchmode', basetexfile], shell=False, stdout=PIPE)
+  elif texexe == 'bibtex':
+    ptex = Popen(['bibtex', basetexfile], shell=False, stdout=PIPE)
+  else:
+    return(False, 'Wrong exe: '+texexe)
+  errstring=""
+  if ptex.wait():
+    if texexe == 'pdflatex':
+      for logrow in ptex.communicate()[0].splitlines():
+        if logrow.startswith('!'):
+          errstring += logrow
+      return(False, errstring)
+    else:
+      for logrow in ptex.communicate()[0].splitlines():
+        if logrow.startswith('This is BibTeX'):
+          continue
+        errstring += logrow + '\n'
+      return(True, errstring)
+  return(True, errstring)
+
+

This creates a class generate_bib_tex(). Is it a good idea to create a class, or should I define plain functions only?
 
+class generate_bib_tex():
+  """
+  Generate a debian.bib and debian.tex files
+  """
+
+  def __init__(self):
+    self.log = logging.getLogger(self.__class__.__name__)
+    if debug==1:
+        self.log.setLevel(logging.DEBUG)
+    else:
+        self.log.setLevel(logging.INFO)
+    handler = logging.handlers.RotatingFileHandler(filename=self.__class__.__name__+'.log',mode='w')
+    formatter = logging.Formatter("%(asctime)s - %(levelname)s - (%(lineno)d): %(message)s")
+    handler.setFormatter(formatter)
+    self.log.addHandler(handler)
+
+    self.bibtexfile = 'debian.bib'
+    self.bibtex_example_tex = 'debian.tex'
+    self.all_ref = 1
+

If self.all_ref = 1:
   references for all the sources in the bibref table will be included (irrespective of whether or not they are included in Debian).
else:
   only references for sources which are both in VCS and in Debian packages will be included. By default self.all_ref = 0.
 
+  def run(self, cur):
+
+    # if there is a working LaTeX installation try to build a BibTeX database and test it by creating a debian.pdf file
+    if isfile('/usr/bin/pdflatex') and access('/usr/bin/pdflatex', X_OK) and \
+       isfile('/usr/bin/bibtex')   and access('/usr/bin/bibtex', X_OK) and \
+       ( isfile('/usr/share/texlive/texmf-dist/fonts/source/jknappen/ec/ecrm.mf') or \
+         isfile('/usr/share/texmf-texlive/fonts/source/jknappen/ec/ecrm.mf') ) :
+
+      # create BibTeX file
+      bf = open(self.bibtexfile, 'w')
+
+      if self.all_ref == 1:
+        query = "SELECT * FROM bibtex()"

This includes references from all the sources in the bibref table, by making use of bibtex() from UDD. However, '#' is not escaped here. How do I change that? Do I make the changes in bibtex() of UDD itself?

Below is the default part, which is similar to bibtex() of UDD but performs an inner join and escapes '#'. Is it better to include this in UDD itself?

+      else:
+       query = """ SELECT DISTINCT
+                        CASE WHEN bibjournal.value IS NULL AND bibin.value IS NOT NULL AND bibpublisher.value IS NOT NULL THEN '@Book{' || bibkey.value
+                            ELSE CASE WHEN bibauthor.value IS NULL OR bibjournal.value IS NULL THEN '@Misc{'|| bibkey.value ||
+                                 CASE WHEN bibauthor.value IS NULL THEN E',\n  Key     = "' || bibkey.value || '"' ELSE '' END -- without author we need a sorting key
+                            ELSE '@Article{' || bibkey.value END END  ||
+                        CASE WHEN bibauthor.value  IS NOT NULL THEN E',\n  Author  = {' || bibauthor.value  || '}' ELSE '' END ||
+                        CASE WHEN bibtitle.value   IS NOT NULL THEN E',\n  Title   = "{' ||
+                          replace(replace(replace(bibtitle.value,
+                                          '#', E'\\#'),            --
+                                          '_', E'\\_'),            --
+                                          '%', E'\\%'),            --
+                                          E'\xe2\x80\x89', E'\\,') -- TeX syntax for '_' and UTF-8 "thin space"
+                                          -- see http://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=128&utf8=string-literal
+                                  || '}"'
+                        ELSE '' END ||
+                        CASE WHEN bibbooktitle.value IS NOT NULL THEN E',\n  Booktitle = "{' || bibbooktitle.value || '}"' ELSE '' END ||
+                        CASE WHEN bibyear.value    IS NOT NULL THEN E',\n  Year    = {' || bibyear.value    || '}' ELSE '' END ||
+                        CASE WHEN bibmonth.value   IS NOT NULL THEN E',\n  Month   = {' || bibmonth.value   || '}' ELSE '' END ||
+                        CASE WHEN bibjournal.value IS NOT NULL THEN E',\n  Journal = {' || replace(bibjournal.value, '&', E'\\&') || '}' ELSE '' END ||
+                        CASE WHEN bibaddress.value IS NOT NULL THEN E',\n  Address = {' || bibaddress.value || '}' ELSE '' END ||
+                        CASE WHEN bibpublisher.value IS NOT NULL THEN E',\n  Publisher = {' || bibpublisher.value || '}' ELSE '' END ||
+                        CASE WHEN bibvolume.value  IS NOT NULL THEN E',\n  Volume  = {' || bibvolume.value  || '}' ELSE '' END ||
+                        CASE WHEN bibnumber.value  IS NOT NULL THEN E',\n  Number  = {' || bibnumber.value  || '}' ELSE '' END ||
+                        CASE WHEN bibpages.value   IS NOT NULL THEN E',\n  Pages   = {' || regexp_replace(bibpages.value, E'(\\d)-([\\d])', E'\\1--\\2')   || '}' ELSE '' END ||
+                        CASE WHEN biburl.value     IS NOT NULL THEN E',\n  URL     = {' ||
+                          replace(replace(replace(replace(biburl.value,
+                                          '#', E'\\#'),           --
+                                          '_', E'\\_'),           --
+                                          '%', E'\\%'),           --
+                                          '&', E'\\&'),           --
+                                          '~', E'\\~{}')          --
+                                  || '}'
+                        ELSE '' END ||
+                        CASE WHEN bibdoi.value     IS NOT NULL THEN E',\n  DOI     = {' ||
+                          replace(replace(bibdoi.value,
+                                          '#', E'\\#'),            --
+                                          '_', E'\\_'),           --
+                                          '&', E'\\&')            --
+                                  || '}'
+                        ELSE '' END ||
+                        CASE WHEN bibpmid.value    IS NOT NULL THEN E',\n  PMID    = {' || bibpmid.value    || '}' ELSE '' END ||
+                        CASE WHEN bibeprint.value  IS NOT NULL THEN E',\n  EPrint  = {' ||
+                          replace(replace(replace(replace(bibeprint.value,
+                                         '#', E'\\#'),            --
+                                         '_', E'\\_'),           --
+                                         '%', E'\\%'),           --
+                                         '&', E'\\&'),           --
+                                         '~', E'\\~{}')          --
+                                  || '}'
+                        ELSE '' END ||
+                        CASE WHEN bibin.value      IS NOT NULL THEN E',\n  In      = {' || bibin.value      || '}' ELSE '' END ||
+                        CASE WHEN bibissn.value    IS NOT NULL THEN E',\n  ISSN    = {' || bibissn.value    || '}' ELSE '' END ||
+                        E',\n}\n'
+                        AS bibentry
+                        --         p.source         AS source,
+                        --         p.rank           AS rank,
+                FROM (SELECT DISTINCT source, package, rank FROM bibref) p
 
+                INNER JOIN sources s ON s.source = p.source

This is the INNER JOIN that ensures that only references of packages that are both in VCS and in Debian are included in the BibTeX file that is created.

+                LEFT OUTER JOIN bibref bibkey     ON p.source = bibkey.source     AND bibkey.rank     = p.rank AND bibkey.package     = p.package AND bibkey.key     = 'bibtex'
+                LEFT OUTER JOIN bibref bibyear    ON p.source = bibyear.source    AND bibyear.rank    = p.rank AND bibyear.package    = p.package AND bibyear.key    = 'year'
+                LEFT OUTER JOIN bibref bibmonth   ON p.source = bibmonth.source   AND bibmonth.rank   = p.rank AND bibmonth.package   = p.package AND bibmonth.key   = 'month'
+                LEFT OUTER JOIN bibref bibtitle   ON p.source = bibtitle.source   AND bibtitle.rank   = p.rank AND bibtitle.package   = p.package AND bibtitle.key   = 'title'
+                LEFT OUTER JOIN bibref bibbooktitle ON p.source = bibbooktitle.source AND bibbooktitle.rank = p.rank AND bibbooktitle.package = p.package AND bibbooktitle.key = 'booktitle'
+                LEFT OUTER JOIN bibref bibauthor  ON p.source = bibauthor.source  AND bibauthor.rank  = p.rank AND bibauthor.package  = p.package AND bibauthor.key  = 'author'
+                LEFT OUTER JOIN bibref bibjournal ON p.source = bibjournal.source AND bibjournal.rank = p.rank AND bibjournal.package = p.package AND bibjournal.key = 'journal'
+                LEFT OUTER JOIN bibref bibaddress ON p.source = bibaddress.source AND bibaddress.rank = p.rank AND bibaddress.package = p.package AND bibaddress.key = 'address'
+                LEFT OUTER JOIN bibref bibpublisher ON p.source = bibpublisher.source AND bibpublisher.rank = p.rank AND bibpublisher.package = p.package AND bibpublisher.key = 'publisher'
+                LEFT OUTER JOIN bibref bibvolume  ON p.source = bibvolume.source  AND bibvolume.rank  = p.rank AND bibvolume.package  = p.package AND bibvolume.key  = 'volume'
+                LEFT OUTER JOIN bibref bibdoi     ON p.source = bibdoi.source     AND bibdoi.rank     = p.rank AND bibdoi.package     = p.package AND bibdoi.key     = 'doi'
+                LEFT OUTER JOIN bibref bibpmid    ON p.source = bibpmid.source    AND bibpmid.rank    = p.rank AND bibpmid.package    = p.package AND bibpmid.key    = 'pmid'LEFT OUTER JOIN bibref biburl     ON p.source = biburl.source     AND biburl.rank     = p.rank AND biburl.package     = p.package AND biburl.key     = 'url'
+                LEFT OUTER JOIN bibref bibnumber  ON p.source = bibnumber.source  AND bibnumber.rank  = p.rank AND bibnumber.package  = p.package AND bibnumber.key  = 'number'
+                LEFT OUTER JOIN bibref bibpages   ON p.source = bibpages.source   AND bibpages.rank   = p.rank AND bibpages.package   = p.package AND bibpages.key   = 'pages'
+                LEFT OUTER JOIN bibref bibeprint  ON p.source = bibeprint.source  AND bibeprint.rank  = p.rank AND bibeprint.package  = p.package AND bibeprint.key  = 'eprint'
+                LEFT OUTER JOIN bibref bibin      ON p.source = bibin.source      AND bibin.rank      = p.rank AND bibin.package      = p.package AND bibin.key      = 'in'
+                LEFT OUTER JOIN bibref bibissn    ON p.source = bibissn.source    AND bibissn.rank    = p.rank AND bibissn.package    = p.package AND bibissn.key    = 'issn'
+                ORDER BY bibentry -- p.source
+                ;"""
+
+      cur.execute(query)
+      for row in cur.fetchall():
+          print >>bf, row[0]
+
+      bf.close()
+
+      # create LaTeX file to test BibTeX functionality
+      bf = open(self.bibtex_example_tex, 'w')
+      print >>bf, """\documentclass[10]{article}
+\usepackage[T1]{fontenc}
+\usepackage[utf8]{inputenc}
+\usepackage[left=2mm,top=2mm,right=2mm,bottom=2mm,nohead,nofoot]{geometry}
+\usepackage{longtable}
+\usepackage[super]{natbib}
+\setlongtables
+\\begin{document}
+\small
+\\begin{longtable}{llp{70mm}l}
+\\bf package & \\bf source & \\bf description & BibTeX key \\\\ \hline"""
+
+      cur.execute("SELECT * FROM bibtex_example_data() AS (package text, source text, bibkey text, description text)")
+      for row in cur.fetchall():
+       print >>bf, row[0], '&', row[1], '&', row[3] , '&', row[2]+'\cite{'+row[2]+'} \\\\'
+
+      print >>bf, """\end{longtable}
+
+% \\bibliographystyle{plain}
+% Try a bit harder by also including URL+DOI
+\\bibliographystyle{plainnat}
+\\bibliography{debian}
+
+\end{document}
+"""
+      bf.close()
+
+      # try to build debian.pdf file to test aboc LaTeX file
+      basetexfile = self.bibtex_example_tex.replace('.tex','')
+      cleanup_tex_logs(basetexfile)
+      try:
+        rename(basetexfile+'.pdf', basetexfile+'.pdf~')
+      except OSError:
+        pass
+
+      (retcode,errstring) = open_tex_process('pdflatex', basetexfile)
+      if not retcode:
+        self.log.error("Problem in 1. PdfLaTeX run of %s.tex: `%s` --> please inspect %s.log" % (basetexfile, errstring, basetexfile))
+        exit(1)
+
+      (retcode,errstring) = open_tex_process('bibtex', basetexfile)
+      if errstring != "":
+        if not retcode:
+          self.log.error("Problem in BibTeX run of %s.bib: `%s`" % (basetexfile, errstring))
+          exit(1)
+        self.log.error("Ignore the following problems in BibTeX run of %s.bib: `%s`" % (basetexfile, errstring))
+
+      (retcode,errstring) = open_tex_process('pdflatex', basetexfile)
+      if not retcode:
+        self.log.error("Problem in 2. PdfLaTeX run of %s.tex: `%s` --> please inspect %s.log" % (basetexfile, errstring, basetexfile))
+        exit(1)
+
+      (retcode,errstring) = open_tex_process('pdflatex', basetexfile)
+      if not retcode:
+        self.log.error("Problem in 3. PdfLaTeX run of %s.tex: `%s` --> please inspect %s.log" % (basetexfile, errstring, basetexfile))
+        exit(1)
+
+      cleanup_tex_logs(basetexfile)
+
+
+
--
1.9.1


I have checked the differences in the debian.bib and debian.tex files. They are as expected.

 -> Without these changes, the references are outdated. 
 -> When self.all_ref = 1, then there are many more references.
 -> When self.all_ref = 0, then the number of references is greater than in the older files but less than when self.all_ref = 1.

Also, is there a possibility that the references which have been injected by bibref_gatherer are updated in blends_prospective_gatherer? I did not find any updates when I wrote the "Upsert" functionality, but if there is such a possibility, then I think we should also include Upsert in blends_prospective_gatherer and then generate the debian.bib and debian.tex files.


-- 
Regards,
Akshita Jha

Reply to: