From Ivory Guinea Pig, 2 Months ago, written in Python.
This paste is a reply to Re: Re: Re: Untitled from Gamboge Stork
- view diff
Embed
  1. #    VoidDomain FB2 converter
  2. #    Copyright © 2019 Anonymous
  3. #
  4. #    This program is free software: you can redistribute it and/or modify
  5. #    it under the terms of the GNU Affero General Public License as
  6. #    published by the Free Software Foundation, either version 3 of the
  7. #    License, or (at your option) any later version.
  8. #
  9. #    This program is distributed in the hope that it will be useful,
  10. #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12. #    GNU Affero General Public License for more details.
  13. #
  14. #    You should have received a copy of the GNU Affero General Public License
  15. #    along with this program.  If not, see <https://www.gnu.org/licenses/>.
  16.  
  17. import os
  18. import sys
  19. import urllib.request
  20. import argparse
  21. from enum import Enum
  22. from collections import OrderedDict
  23. from html.parser import HTMLParser
  24. import re
  25. import datetime
  26. import getpass
  27. import uuid
  28. import traceback
  29. from xml.sax.saxutils import escape
  30.  
  31. index_url = "https://towercurator.wordpress.com/index/"
  32.  
  33. genre_xml = """<genre match="100">sf_fantasy</genre>
  34. """
  35.  
  36. author_xml = """<author>
  37. <nickname>TowerCurator</nickname>
  38. </author>
  39. """
  40.  
  41. annotation_xml = """<annotation>
  42. <p>Brakket Magical Academy in the Northwestern United States is on its last legs. Enrollment of new students is at an all time low. The academy instructors go out to recruit prospective children other magical academies have ignored.</p>
  43. <p>Eva is one such recruit. After witnessing her perform magic no teenage mage should have learned, an instructor of the academy offers her a full ride scholarship. Eva does not turn down the opportunity to learn magic in a proper capacity and quickly ships out to Montana.</p>
  44. <p>Barely a day there and things already seem off. Every student has the same scholarship, odd-smelling men wander the town, and a spider demon has decided Eva’s dorm room is the place to be.</p>
  45. </annotation>
  46. """
  47.  
  48. misc_book_xml = """<lang>en</lang>
  49. <src-lang>en</src-lang>
  50. """
  51.  
  52. doc_author_xml = """<first-name></first-name>
  53. <middle-name></middle-name>
  54. <last-name></last-name>
  55. <nickname>{}</nickname>
  56. <email></email>
  57. """.format (getpass.getuser ())
  58.  
  59. program_xml = """<program-used>VoidDomain conversion script v1.2</program-used>
  60. """
  61.  
  62. doc_version_xml = """<version>1.2</version>
  63. """
  64.  
  65. publisher_xml = """<publisher>TowerCurator</publisher>
  66. """
  67.  
  68. namespace_uuid = uuid.UUID (hex='d3d25492-cf0b-11e9-be60-1c6f65d74be8')
  69.  
  70. buffer_length = 1024
  71.  
  72. class Dummy (object):
  73.   pass
  74.  
  75. def fetch_page (filename, url, force=False):
  76.   if not force \
  77.       and os.path.exists (filename) \
  78.       and os.path.getsize (filename) > 0:
  79.     return
  80.   print ("Downloading {}".format (filename))
  81.   request = urllib.request.Request (url=url)
  82.   i = 0
  83.   with urllib.request.urlopen (request) as page_contents:
  84.     with open (filename, 'wb') as f:
  85.       while True:
  86.         contents = page_contents.read (buffer_length)
  87.         if len (contents) > 0:
  88.           f.write (contents)
  89.         else:
  90.           break
  91.  
class VoidDomainIndexParser (HTMLParser):
  """HTML parser that extracts the book/chapter listing from the index page.

  After feeding, `structure` is a list of `['book', number, chapters]`
  records.  Each entry of `chapters` starts out as
  `['chapter', href, link_text]`; the text that follows the link (up to the
  next `br` or the end of the paragraph) is appended as a further element
  when it is captured.
  """

  def __init__ (self):
    super ().__init__ ()
    # True once a <div class="entry"> start tag has been seen.
    self.found_entry = False
    # True once a <strong> whose text is "Miscellaneous" has been closed;
    # nothing is recorded after that point.
    self.found_misc = False
    # True between the end of a chapter link and the capture of the text
    # that follows it.
    self.post_a = False
    # href of the most recently opened <a>, or None if it had none.
    self.last_href = None
    # Text fragments received since the last tag event.
    self.last_data = []
    # Names of the start tags seen so far; one entry is dropped per end tag.
    self.stack = list ()
    self.structure = list ()

  def handle_starttag (self, tag, attrs):
    """Handle a start tag: capture trailing link text, track state, detect books."""
    # From here until the reset at the bottom, last_data is a str, not a list.
    self.last_data = "".join (self.last_data)
    # A <br> after a chapter link ends the text that followed the link:
    # store it (minus a leading en dash) on the most recent chapter record.
    if tag == 'br' \
        and self.post_a \
        and len (self.last_data) > 0 \
        and self.found_entry \
        and not self.found_misc:
      if self.last_data[0] == u"\u2013":
        self.last_data = self.last_data[1:]
      # NOTE(review): assumes a book record already exists; an empty
      # structure would raise IndexError here.
      self.structure[-1][2][-1].append (self.last_data)
      self.post_a = False
    self.stack.append (tag)
    if tag == "div" \
        and not self.found_entry \
        and ('class', 'entry') in attrs:
      self.found_entry = True
    elif tag == 'a':
      # Remember the target of this link for handle_endtag.
      self.last_href = None
      for attr in attrs:
        if attr[0] == 'href':
          self.last_href = attr[1]
          break
    # A "Book N..." heading that is terminated by a <br> inside <p><strong>.
    elif self.stack[-3:] == ['p', 'strong', 'br'] \
        and self.found_entry \
        and not self.found_misc \
        and self.last_data[:5] == "Book ":
      m = re.match (r'Book ([0-9]+).*', self.last_data)
      if m:
        self.structure.append (['book', int (m.group (1)), list ()])
    self.last_data = []

  def handle_endtag (self, tag):
    """Handle an end tag: detect headings, record chapter links and trailing text."""
    last_data = "".join (self.last_data)
    # The "Miscellaneous" heading marks the end of the part that is recorded.
    if not self.found_misc \
        and self.stack[-3:] == ['div', 'p', 'strong'] \
        and last_data == "Miscellaneous":
      self.found_misc = True
    # A "Book N..." heading closed directly by </strong>.
    elif self.stack[-3:] == ['div', 'p', 'strong'] \
        and self.found_entry \
        and not self.found_misc \
        and last_data[:5] == "Book ":
      m = re.match (r'Book ([0-9]+).*', last_data)
      if m:
        self.structure.append (['book', int (m.group (1)), list ()])
    # A link directly inside a paragraph of the entry is a chapter link.
    elif self.stack[-3:] == ['div', 'p', 'a'] \
        and self.found_entry \
        and not self.found_misc \
        and self.last_href is not None:
      if last_data[:2] == ">>":
        last_data = last_data[2:]
      # NOTE(review): assumes a book record already exists; otherwise
      # structure[-1] raises IndexError.
      self.structure[-1][2].append (['chapter', self.last_href, last_data])
      last_data = ''
      self.post_a = True
    # End of the paragraph while text after a chapter link is still pending:
    # store that text (minus a leading en dash) on the chapter record.
    # Note that the length test below is on the fragment list, not on the
    # joined string, so an empty string can be appended.
    elif tag == 'p' and \
        self.post_a \
        and len (self.last_data) > 0 \
        and self.found_entry \
        and not self.found_misc:
      if len (last_data) > 0 and last_data[0] == u"\u2013":
        last_data = last_data[1:]
      self.structure[-1][2][-1].append (last_data)
      self.post_a = False
    # Drop the top stack entry regardless of which tag was closed.
    self.stack = self.stack[:-1]
    # While text after a link is pending, keep what was collected so far so
    # that it survives nested tags; otherwise start afresh.
    if not self.post_a:
      self.last_data = []
    else:
      self.last_data = [last_data]

  def handle_data (self, data):
    """Collect a text fragment until the next tag event consumes it."""
    self.last_data.append (data)
  173.  
  174. class VoidDomainChapterParser (HTMLParser):
  175.   def __init__ (self):
  176.     super ().__init__ ()
  177.     self.found_entry = False
  178.     self.jp_post_flair = False
  179.     self.last_data = []
  180.     self.stack = list ()
  181.     self.structure = Dummy ()
  182.     self.structure.t = 'top'
  183.     self.structure.first_child = None
  184.     self.structure.last_child = None
  185.     self.structure.parent = None
  186.     self.structure.next = None
  187.     self.structure_head = self.structure
  188.     self.structure.post_date = None
  189.     self.structure.mod_date = None
  190.     self.ul_depth = 0
  191.  
  192.   def break_paragraph (self):
  193.     c = self.structure_head
  194.     p = None
  195.     while c is not None:
  196.       if c.t == 'para':
  197.         p = c
  198.         break
  199.       c = c.parent
  200.  
  201.     if p is None:
  202.       return
  203.  
  204.     stack = list ()
  205.     c = self.structure_head
  206.     while c is not None:
  207.       stack.append (c)
  208.       if p is c:
  209.         break
  210.       c = c.parent
  211.  
  212.     for item in stack:
  213.       self.close_tag ()
  214.  
  215.     for item in reversed (stack):
  216.       self.new_child_copy (item)
  217.  
  218.   def new_child_tag (self, tag, attrs):
  219.     if tag == 'p':
  220.       self.new_child ('para')
  221.     elif tag == 'i' or tag == 'em':
  222.       self.new_child ('i')
  223.     elif tag == 'b' or tag == 'strong':
  224.       self.new_child ('str')
  225.     elif tag == "span" \
  226.         and (('style', 'font-size:xx-small;') in attrs \
  227.              or ('style', 'font-size:small;') in attrs):
  228.       self.new_child ('sub')
  229.     elif tag == "span" \
  230.         and ('style', 'font-variant:small-caps;') in attrs:
  231.       self.new_child ('code')
  232.     elif tag == 'a' and 'href' in [x[0] for x in attrs]:
  233.       self.new_child ('a', extra=next ((x for x in attrs if x[0] == 'href'))[1])
  234.     elif tag == 'del':
  235.       self.new_child ('del')
  236.     elif tag == 'ul':
  237.       if self.ul_depth == 0:
  238.         self.new_child ('para')
  239.       else:
  240.         self.new_child ('ul')
  241.       self.ul_depth += 1
  242.     else:
  243.       raise Exception("unhandled tag {} with attrs {}".format (tag, attrs))
  244.  
  245.   def handle_starttag (self, tag, attrs):
  246.     self.last_data = "".join (self.last_data)
  247.     self.stack.append ([tag, attrs])
  248.     if tag == "div" \
  249.         and not self.found_entry \
  250.         and ('class', 'entry') in attrs:
  251.       self.found_entry = True
  252.     elif self.found_entry \
  253.         and not self.jp_post_flair \
  254.         and tag == "div" \
  255.         and ('id', 'jp-post-flair') in attrs:
  256.       self.jp_post_flair = True
  257.     elif self.found_entry and not self.jp_post_flair:
  258.       if tag == 'br':
  259.         self.break_paragraph ()
  260.       elif tag == 'li':
  261.         self.break_paragraph ()
  262.         self.new_child ('data', '\u2022' * self.ul_depth + ' ')
  263.       else:
  264.         self.new_child_tag (tag, attrs)
  265.  
  266.     self.last_data = []
  267.  
  268.   def handle_endtag (self, tag):
  269.     last_data = "".join (self.last_data)
  270.     if self.found_entry and not self.jp_post_flair and tag != 'li':
  271.       if tag == 'ul':
  272.         self.ul_depth -= 1
  273.       self.close_tag ()
  274.     self.stack = self.stack[:-1]
  275.  
  276.   def handle_startendtag (self, tag, attrs):
  277.     last_data = "".join (self.last_data)
  278.     if tag == "meta" \
  279.         and ('property', 'article:published_time') in attrs:
  280.       self.structure.post_date = datetime.datetime.fromisoformat (next ((x for x in attrs if x[0] == 'content'))[1])
  281.     elif tag == "meta" \
  282.         and ('property', 'article:modified_time') in attrs:
  283.       self.structure.mod_date = datetime.datetime.fromisoformat (next ((x for x in attrs if x[0] == 'content'))[1])
  284.     if self.found_entry and not self.jp_post_flair:
  285.       if tag == 'br':
  286.         self.break_paragraph ()
  287.       else:
  288.         raise Exception("unhandled startend tag {} with attrs {}".format (tag, attrs))
  289.  
  290.   def handle_data (self, data):
  291.     self.last_data.append (data)
  292.     if not self.found_entry or self.jp_post_flair:
  293.       return
  294.     self.new_child ('data', data)
  295.  
  296.   def close_tag (self):
  297.     self.structure_head = self.structure_head.parent
  298.  
  299.   def new_child_copy (self, orig):
  300.     data = None
  301.     extra = None
  302.     if orig.t == 'data':
  303.       data = orig.data
  304.     if orig.t == 'a':
  305.       extra = orig.href
  306.     self.new_child (orig.t, data, extra)
  307.  
  308.   def new_child (self, t, data=None, extra=None):
  309.     if self.structure_head is None:
  310.       return
  311.     c = Dummy ()
  312.     c.t = t
  313.     c.first_child = None
  314.     c.last_child = None
  315.     c.parent = self.structure_head
  316.     c.next = None
  317.     if self.structure_head.last_child is not None:
  318.       self.structure_head.last_child.next = c
  319.     self.structure_head.last_child = c
  320.     if self.structure_head.first_child is None:
  321.       self.structure_head.first_child = c
  322.     if t != 'data':
  323.       self.structure_head = c
  324.     else:
  325.       c.data = data
  326.     if t == 'a':
  327.       c.href = extra
  328.  
  329. def parse_html (filename, parser):
  330.   with open (filename, 'rb') as fd:
  331.     buffer = []
  332.     while True:
  333.       b = fd.read (buffer_length)
  334.       if len (b) > 0:
  335.         buffer.append (b)
  336.       try:
  337.         text = b"".join (buffer).decode ('utf-8')
  338.       except:
  339.         pass
  340.       else:
  341.         buffer = []
  342.         parser.feed (text)
  343.       if len (b) <= 0:
  344.         break
  345.   return parser.structure
  346.  
  347. def pu (f, s):
  348.   f.write (s.encode ("utf-8"))
  349.  
  350. def output_fb2 (book, outdir):
  351.   title = "Void Domain, Book {:02d}".format (book.number)
  352.   filename = os.path.join (outdir, "{}.fb2".format (title))
  353.   print ("Writing {}".format (filename))
  354.   with open (filename, 'wb') as outfile:
  355.     pu (outfile, \
  356. """<?xml version="1.0" encoding="utf-8" ?>
  357. <FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">
  358. <description>
  359. <title-info>
  360. """)
  361.     pu (outfile, genre_xml)
  362.     pu (outfile, author_xml)
  363.     pu (outfile, \
  364. """<book-title>{}</book-title>
  365. """.format (escape (title)))
  366.     if book.number == 1:
  367.       pu (outfile, annotation_xml)
  368.     pu (outfile, \
  369. """<date value="{}">{}</date>
  370. """.format (book.post.strftime ("%Y-%m-%d"), book.post.strftime ("%Y")))
  371.     pu (outfile, misc_book_xml)
  372.     pu (outfile,
  373. """<sequence name="Void Domain" number="{}"/>
  374. """.format (book.number))
  375.     pu (outfile,
  376. """</title-info>
  377. <document-info>
  378. <author>
  379. """)
  380.     pu (outfile, doc_author_xml)
  381.     pu (outfile, \
  382. """</author>
  383. """)
  384.     pu (outfile, program_xml)
  385.     now = datetime.datetime.now ()
  386.     pu (outfile, \
  387. """<date value="{}">{}</date>
  388. """.format (now.strftime ("%Y-%m-%d"), now.strftime ("%d.%m.%Y")))
  389.     guid = uuid.uuid3 (namespace_uuid, title)
  390.     pu (outfile, \
  391. """<id>{}</id>
  392. """.format (guid))
  393.     pu (outfile, doc_version_xml)
  394.     pu (outfile, \
  395. """</document-info>
  396. <publish-info>
  397. <book-name>{}</book-name>
  398. {}<year>{}</year>
  399. </publish-info>
  400. </description>
  401. <body>
  402. """.format (escape (title), publisher_xml, book.mod.strftime ("%Y")))
  403.     pu (outfile, \
  404. """<title>
  405. <p>{}</p>
  406. </title>
  407. """.format (title))
  408.     for chapter in book.chapters:
  409.       pu (outfile, \
  410. """<section>
  411. <title>
  412. <p><strong>{}</strong></p>
  413. </title>
  414. """.format (escape (chapter.title)))
  415.       if chapter.subtitle is not None:
  416.         pu (outfile, \
  417. """<subtitle>
  418. <strong>{}</strong>
  419. </subtitle>
  420. """.format (escape (chapter.subtitle)))
  421.       output_tags (outfile, chapter.structure)
  422.       pu (outfile, \
  423. """</section>
  424. """)
  425.     pu (outfile, \
  426. """</body>
  427. </FictionBook>
  428. """)
  429.  
  430. def children_as_list (node):
  431.   c = node.first_child
  432.   children = list ()
  433.   while c is not None:
  434.     children.append (c)
  435.     c = c.next
  436.   return children
  437.  
  438.  
  439. def skip_paragraph (node):
  440.   children = children_as_list (node)
  441.   if len (children) >= 5 and \
  442.       children[0].t in ['a', 'del'] and \
  443.       children[1].t == 'data' and \
  444.       children[2].t == 'a' and \
  445.       children[3].t == 'data' and \
  446.       children[4].t in ['a', 'del'] and \
  447.       '|' in children[1].data and \
  448.       '|' in children[3].data:
  449.       return True
  450.   elif len (children) >= 3 and \
  451.       children[0].t in ['a', 'del'] and \
  452.       children[1].t == 'data' and \
  453.       children[2].t == 'a' and \
  454.       '|' in children[1].data:
  455.       return True
  456.  
  457.   return False
  458.  
  459. def skip_href (node):
  460.   children = children_as_list (node)
  461.   if len (children) >= 1 and \
  462.       children[0].t == 'data':
  463.     if '>>' in children[0].data and '<<' in children[0].data:
  464.       return True
  465.  
  466.   return False
  467.  
  468. def output_tags_recursive (outfile, node, inside_para):
  469.   if node.t == 'data':
  470.     pu (outfile, escape (node.data))
  471.     return
  472.  
  473.   if node.t == 'para':
  474.     if skip_paragraph (node):
  475.       return
  476.     if inside_para[0]:
  477.       raise Exception ("Nested <p> tags")
  478.     inside_para[0] = True
  479.     pu (outfile, "<p>")
  480.   elif node.t == 'str':
  481.     pu (outfile, "<strong>")
  482.   elif node.t == 'i':
  483.     pu (outfile, "<emphasis>")
  484.   elif node.t == 'del':
  485.     pu (outfile, "<strikethrough>")
  486.   elif node.t == 'sub':
  487.     pu (outfile, "<sub>")
  488.   elif node.t == 'code':
  489.     pu (outfile, "<code>")
  490.   elif node.t == 'a':
  491.     if skip_href (node):
  492.       return
  493.     pu (outfile, """<a l:type="simple" l:href="{}">""".format (escape (node.href)))
  494.   elif node.t == 'ul':
  495.     pass
  496.   else:
  497.     raise Exception ("Unhandled node type {}".format (node.t))
  498.  
  499.   c = node.first_child
  500.   while c is not None:
  501.     output_tags_recursive (outfile, c, inside_para)
  502.     c = c.next
  503.  
  504.   if node.t == 'para':
  505.     pu (outfile, "</p>")
  506.     inside_para[0] = False
  507.   elif node.t == 'str':
  508.     pu (outfile, "</strong>")
  509.   elif node.t == 'i':
  510.     pu (outfile, "</emphasis>")
  511.   elif node.t == 'del':
  512.     pu (outfile, "</strikethrough>")
  513.   elif node.t == 'sub':
  514.     pu (outfile, "</sub>")
  515.   elif node.t == 'code':
  516.     pu (outfile, "</code>")
  517.   elif node.t == 'a':
  518.     pu (outfile, "</a>")
  519.  
  520. def output_tags (outfile, top):
  521.    c = top.first_child
  522.    while c is not None:
  523.      p = list ()
  524.      p.append (False)
  525.      output_tags_recursive (outfile, c, p)
  526.      c = c.next
  527.  
  528. def main ():
  529.   batch = False
  530.   try:
  531.     parser = argparse.ArgumentParser ()
  532.     parser.add_argument ("-w", "--work-dir", default=".",
  533.                          help="work directory for temporary files (default is %(default)s)")
  534.     parser.add_argument ("-f", "--force", action='store_true', default=False,
  535.                          help="re-download the HTML files even if they are already downloaded (default is %(default)s)")
  536.     parser.add_argument ("-b", "--batch-mode", action='store_true', default=False,
  537.                          help="no user interaction (default is %(default)s)")
  538.     parser.add_argument ("-o", "--output-dir", metavar="OUTDIR", default=".",
  539.                          help="the directory to write FB2 files into (default is %(default)s)")
  540.     args = parser.parse_args ()
  541.     batch = args.batch_mode
  542.     index_filename = os.path.join (args.work_dir, "index.html")
  543.     fetch_page (index_filename, index_url, args.force)
  544.     structure = parse_html (index_filename, VoidDomainIndexParser ())
  545.     books = list ()
  546.     for book_rec in structure:
  547.       chapter_recs = book_rec[2]
  548.       book = Dummy ()
  549.       book.number = book_rec[1]
  550.       book.chapters = list ()
  551.       books.append (book)
  552.       for chap_rec in chapter_recs:
  553.         chapter = Dummy ()
  554.         chapter.url = chap_rec[1]
  555.         chapter.title = chap_rec[2]
  556.         chapter.subtitle = chap_rec[3] if chap_rec[3] else None
  557.         filename = "book-{}-chapter-{}.html".format (book.number, chapter.title)
  558.         filename = os.path.join (args.work_dir, filename)
  559.         chapter.filename = filename
  560.         fetch_page (filename, chapter.url, args.force)
  561.         chapter.structure = parse_html (filename, VoidDomainChapterParser ())
  562.         book.chapters.append (chapter)
  563.       book.post = None
  564.       book.mod = None
  565.       for chapter in book.chapters:
  566.         if chapter.structure.post_date is not None and (book.post is None or chapter.structure.post_date > book.post):
  567.           book.post = chapter.structure.post_date
  568.         if chapter.structure.mod_date is not None and (book.mod is None or chapter.structure.mod_date > book.mod):
  569.           book.mod = chapter.structure.mod_date
  570.       output_fb2 (book, args.output_dir)
  571.   except Exception:
  572.     traceback.print_exc (file=sys.stderr)
  573.   finally:
  574.     sys.stderr.flush ()
  575.     sys.stdout.flush ()
  576.     if not batch:
  577.       input ("Press Enter to end the program")
  578.  
  579.   sys.exit (0)
  580.  
# Run the converter only when this file is executed as a script.
if __name__ == '__main__':
  main ()
  583.