From Gamboge Stork, 1 Month ago, written in Python.
This paste is a reply to Re: Re: Untitled from Queen Macaw
- view diff
Embed
  1.  
  2. #    VoidDomain FB2 converter
  3. #    Copyright © 2019 Anonymous
  4. #
  5. #    This program is free software: you can redistribute it and/or modify
  6. #    it under the terms of the GNU Affero General Public License as
  7. #    published by the Free Software Foundation, either version 3 of the
  8. #    License, or (at your option) any later version.
  9. #
  10. #    This program is distributed in the hope that it will be useful,
  11. #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13. #    GNU Affero General Public License for more details.
  14. #
  15. #    You should have received a copy of the GNU Affero General Public License
  16. #    along with this program.  If not, see <https://www.gnu.org/licenses/>.
  17.  
  18. import os
  19. import sys
  20. import urllib.request
  21. import argparse
  22. from enum import Enum
  23. from collections import OrderedDict
  24. from html.parser import HTMLParser
  25. import re
  26. import datetime
  27. import getpass
  28. import uuid
  29. import traceback
  30. from xml.sax.saxutils import escape
  31.  
  32. index_url = "https://towercurator.wordpress.com/index/"
  33.  
  34. genre_xml = """<genre match="100">sf_fantasy</genre>
  35. """
  36.  
  37. author_xml = """<author>
  38. <nickname>TowerCurator</nickname>
  39. </author>
  40. """
  41.  
  42. annotation_xml = """<annotation>
  43. <p>Brakket Magical Academy in the Northwestern United States is on its last legs. Enrollment of new students is at an all time low. The academy instructors go out to recruit prospective children other magical academies have ignored.</p>
  44. <p>Eva is one such recruit. After witnessing her perform magic no teenage mage should have learned, an instructor of the academy offers her a full ride scholarship. Eva does not turn down the opportunity to learn magic in a proper capacity and quickly ships out to Montana.</p>
  45. <p>Barely a day there and things already seem off. Every student has the same scholarship, odd-smelling men wander the town, and a spider demon has decided Eva’s dorm room is the place to be.</p>
  46. </annotation>
  47. """
  48.  
  49. misc_book_xml = """<lang>en</lang>
  50. <src-lang>en</src-lang>
  51. """
  52.  
  53. doc_author_xml = """<first-name></first-name>
  54. <middle-name></middle-name>
  55. <last-name></last-name>
  56. <nickname>{}</nickname>
  57. <email></email>
  58. """.format (getpass.getuser ())
  59.  
  60. program_xml = """<program-used>VoidDomain conversion script v1.1</program-used>
  61. """
  62.  
  63. doc_version_xml = """<version>1.1</version>
  64. """
  65.  
  66. publisher_xml = """<publisher>TowerCurator</publisher>
  67. """
  68.  
  69. namespace_uuid = uuid.UUID (hex='d3d25492-cf0b-11e9-be60-1c6f65d74be8')
  70.  
  71. buffer_length = 1024
  72.  
  73. class Dummy (object):
  74.   pass
  75.  
  76. def fetch_page (filename, url, force=False):
  77.   if not force \
  78.       and os.path.exists (filename) \
  79.       and os.path.getsize (filename) > 0:
  80.     return
  81.   print ("Downloading {}".format (filename))
  82.   request = urllib.request.Request (url=url)
  83.   i = 0
  84.   with urllib.request.urlopen (request) as page_contents:
  85.     with open (filename, 'wb') as f:
  86.       while True:
  87.         contents = page_contents.read (buffer_length)
  88.         if len (contents) > 0:
  89.           f.write (contents)
  90.         else:
  91.           break
  92.  
  93. class VoidDomainIndexParser (HTMLParser):
  94.   def __init__ (self):
  95.     super ().__init__ ()
  96.     self.found_entry = False
  97.     self.found_misc = False
  98.     self.post_a = False
  99.     self.last_href = None
  100.     self.last_data = []
  101.     self.stack = list ()
  102.     self.structure = list ()
  103.  
  104.   def handle_starttag (self, tag, attrs):
  105.     self.last_data = "".join (self.last_data)
  106.     if tag == 'br' \
  107.         and self.post_a \
  108.         and len (self.last_data) > 0 \
  109.         and self.found_entry \
  110.         and not self.found_misc:
  111.       if self.last_data[0] == u"\u2013":
  112.         self.last_data = self.last_data[1:]
  113.       self.structure[-1][2][-1].append (self.last_data)
  114.       self.post_a = False
  115.     self.stack.append (tag)
  116.     if tag == "div" \
  117.         and not self.found_entry \
  118.         and ('class', 'entry') in attrs:
  119.       self.found_entry = True
  120.     elif tag == 'a':
  121.       self.last_href = None
  122.       for attr in attrs:
  123.         if attr[0] == 'href':
  124.           self.last_href = attr[1]
  125.           break
  126.     elif self.stack[-3:] == ['p', 'strong', 'br'] \
  127.         and self.found_entry \
  128.         and not self.found_misc \
  129.         and self.last_data[:5] == "Book ":
  130.       m = re.match (r'Book ([0-9]+).*', self.last_data)
  131.       if m:
  132.         self.structure.append (['book', int (m.group (1)), list ()])
  133.     self.last_data = []
  134.  
  135.   def handle_endtag (self, tag):
  136.     last_data = "".join (self.last_data)
  137.     if not self.found_misc \
  138.         and self.stack[-3:] == ['div', 'p', 'strong'] \
  139.         and last_data == "Miscellaneous":
  140.       self.found_misc = True
  141.     elif self.stack[-3:] == ['div', 'p', 'strong'] \
  142.         and self.found_entry \
  143.         and not self.found_misc \
  144.         and last_data[:5] == "Book ":
  145.       m = re.match (r'Book ([0-9]+).*', last_data)
  146.       if m:
  147.         self.structure.append (['book', int (m.group (1)), list ()])
  148.     elif self.stack[-3:] == ['div', 'p', 'a'] \
  149.         and self.found_entry \
  150.         and not self.found_misc \
  151.         and self.last_href is not None:
  152.       if last_data[:2] == ">>":
  153.         last_data = last_data[2:]
  154.       self.structure[-1][2].append (['chapter', self.last_href, last_data])
  155.       last_data = ''
  156.       self.post_a = True
  157.     elif tag == 'p' and \
  158.         self.post_a \
  159.         and len (self.last_data) > 0 \
  160.         and self.found_entry \
  161.         and not self.found_misc:
  162.       if len (last_data) > 0 and last_data[0] == u"\u2013":
  163.         last_data = last_data[1:]
  164.       self.structure[-1][2][-1].append (last_data)
  165.       self.post_a = False
  166.     self.stack = self.stack[:-1]
  167.     if not self.post_a:
  168.       self.last_data = []
  169.     else:
  170.       self.last_data = [last_data]
  171.  
  172.   def handle_data (self, data):
  173.     self.last_data.append (data)
  174.  
  175. class VoidDomainChapterParser (HTMLParser):
  176.   def __init__ (self):
  177.     super ().__init__ ()
  178.     self.found_entry = False
  179.     self.jp_post_flair = False
  180.     self.last_data = []
  181.     self.stack = list ()
  182.     self.structure = Dummy ()
  183.     self.structure.t = 'top'
  184.     self.structure.first_child = None
  185.     self.structure.last_child = None
  186.     self.structure.parent = None
  187.     self.structure.next = None
  188.     self.structure_head = self.structure
  189.     self.structure.post_date = None
  190.     self.structure.mod_date = None
  191.     self.ul_depth = 0
  192.  
  193.   def break_paragraph (self):
  194.     c = self.structure_head
  195.     p = None
  196.     while c is not None:
  197.       if c.t == 'para':
  198.         p = c
  199.         break
  200.       c = c.parent
  201.  
  202.     if p is None:
  203.       return
  204.  
  205.     stack = list ()
  206.     c = self.structure_head
  207.     while c is not None:
  208.       stack.append (c)
  209.       if p is c:
  210.         break
  211.       c = c.parent
  212.  
  213.     for item in stack:
  214.       self.close_tag ()
  215.  
  216.     for item in reversed (stack):
  217.       self.new_child_copy (item)
  218.  
  219.   def new_child_tag (self, tag, attrs):
  220.     if tag == 'p':
  221.       self.new_child ('para')
  222.     elif tag == 'i' or tag == 'em':
  223.       self.new_child ('i')
  224.     elif tag == 'b' or tag == 'strong':
  225.       self.new_child ('str')
  226.     elif tag == "span" \
  227.         and (('style', 'font-size:xx-small;') in attrs \
  228.              or ('style', 'font-size:small;') in attrs):
  229.       self.new_child ('sub')
  230.     elif tag == "span" \
  231.         and ('style', 'font-variant:small-caps;') in attrs:
  232.       self.new_child ('code')
  233.     elif tag == 'a' and 'href' in [x[0] for x in attrs]:
  234.       self.new_child ('a', extra=next ((x for x in attrs if x[0] == 'href'))[1])
  235.     elif tag == 'del':
  236.       self.new_child ('del')
  237.     elif tag == 'ul':
  238.       if self.ul_depth == 0:
  239.         self.new_child ('para')
  240.       else:
  241.         self.new_child ('ul')
  242.       self.ul_depth += 1
  243.     else:
  244.       raise Exception("unhandled tag {} with attrs {}".format (tag, attrs))
  245.  
  246.   def handle_starttag (self, tag, attrs):
  247.     self.last_data = "".join (self.last_data)
  248.     self.stack.append ([tag, attrs])
  249.     if tag == "div" \
  250.         and not self.found_entry \
  251.         and ('class', 'entry') in attrs:
  252.       self.found_entry = True
  253.     elif self.found_entry \
  254.         and not self.jp_post_flair \
  255.         and tag == "div" \
  256.         and ('id', 'jp-post-flair') in attrs:
  257.       self.jp_post_flair = True
  258.     elif self.found_entry and not self.jp_post_flair:
  259.       if tag == 'br':
  260.         self.break_paragraph ()
  261.       elif tag == 'li':
  262.         self.break_paragraph ()
  263.         self.new_child ('data', '\u2022' * self.ul_depth + ' ')
  264.       else:
  265.         self.new_child_tag (tag, attrs)
  266.  
  267.     self.last_data = []
  268.  
  269.   def handle_endtag (self, tag):
  270.     last_data = "".join (self.last_data)
  271.     if self.found_entry and not self.jp_post_flair and tag != 'li':
  272.       if tag == 'ul':
  273.         self.ul_depth -= 1
  274.       self.close_tag ()
  275.     self.stack = self.stack[:-1]
  276.  
  277.   def handle_startendtag (self, tag, attrs):
  278.     last_data = "".join (self.last_data)
  279.     if tag == "meta" \
  280.         and ('property', 'article:published_time') in attrs:
  281.       self.structure.post_date = datetime.datetime.fromisoformat (next ((x for x in attrs if x[0] == 'content'))[1])
  282.     elif tag == "meta" \
  283.         and ('property', 'article:modified_time') in attrs:
  284.       self.structure.mod_date = datetime.datetime.fromisoformat (next ((x for x in attrs if x[0] == 'content'))[1])
  285.     if self.found_entry and not self.jp_post_flair:
  286.       if tag == 'br':
  287.         self.break_paragraph ()
  288.       else:
  289.         raise Exception("unhandled startend tag {} with attrs {}".format (tag, attrs))
  290.  
  291.   def handle_data (self, data):
  292.     self.last_data.append (data)
  293.     if not self.found_entry or self.jp_post_flair:
  294.       return
  295.     self.new_child ('data', data)
  296.  
  297.   def close_tag (self):
  298.     self.structure_head = self.structure_head.parent
  299.  
  300.   def new_child_copy (self, orig):
  301.     data = None
  302.     extra = None
  303.     if orig.t == 'data':
  304.       data = orig.data
  305.     if orig.t == 'a':
  306.       extra = orig.href
  307.     self.new_child (orig.t, data, extra)
  308.  
  309.   def new_child (self, t, data=None, extra=None):
  310.     if self.structure_head is None:
  311.       return
  312.     c = Dummy ()
  313.     c.t = t
  314.     c.first_child = None
  315.     c.last_child = None
  316.     c.parent = self.structure_head
  317.     c.next = None
  318.     if self.structure_head.last_child is not None:
  319.       self.structure_head.last_child.next = c
  320.     self.structure_head.last_child = c
  321.     if self.structure_head.first_child is None:
  322.       self.structure_head.first_child = c
  323.     if t != 'data':
  324.       self.structure_head = c
  325.     else:
  326.       c.data = data
  327.     if t == 'a':
  328.       c.href = extra
  329.  
  330. def parse_html (filename, parser):
  331.   with open (filename, 'rb') as fd:
  332.     buffer = []
  333.     while True:
  334.       b = fd.read (buffer_length)
  335.       if len (b) > 0:
  336.         buffer.append (b)
  337.       try:
  338.         text = b"".join (buffer).decode ('utf-8')
  339.       except:
  340.         pass
  341.       else:
  342.         buffer = []
  343.         parser.feed (text)
  344.       if len (b) <= 0:
  345.         break
  346.   return parser.structure
  347.  
  348. def pu (f, s):
  349.   f.write (s.encode ("utf-8"))
  350.  
  351. def output_fb2 (book, outdir):
  352.   title = "Void Domain, Book {:02d}".format (book.number)
  353.   filename = os.path.join (outdir, "{}.fb2".format (title))
  354.   print ("Writing {}".format (filename))
  355.   with open (filename, 'wb') as outfile:
  356.     pu (outfile, \
  357. """<?xml version="1.0" encoding="utf-8" ?>
  358. <FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">
  359. <description>
  360. <title-info>
  361. """)
  362.     pu (outfile, genre_xml)
  363.     pu (outfile, author_xml)
  364.     pu (outfile, \
  365. """<book-title>{}</book-title>
  366. """.format (escape (title)))
  367.     if book.number == 1:
  368.       pu (outfile, annotation_xml)
  369.     pu (outfile, \
  370. """<date value="{}">{}</date>
  371. """.format (book.post.strftime ("%Y-%m-%d"), book.post.strftime ("%Y")))
  372.     pu (outfile, misc_book_xml)
  373.     pu (outfile,
  374. """<sequence name="Void Domain" number="{}"/>
  375. """.format (book.number))
  376.     pu (outfile,
  377. """</title-info>
  378. <document-info>
  379. <author>
  380. """)
  381.     pu (outfile, doc_author_xml)
  382.     pu (outfile, \
  383. """</author>
  384. """)
  385.     pu (outfile, program_xml)
  386.     now = datetime.datetime.now ()
  387.     pu (outfile, \
  388. """<date value="{}">{}</date>
  389. """.format (now.strftime ("%Y-%m-%d"), now.strftime ("%d.%m.%Y")))
  390.     guid = uuid.uuid3 (namespace_uuid, title)
  391.     pu (outfile, \
  392. """<id>{}</id>
  393. """.format (guid))
  394.     pu (outfile, doc_version_xml)
  395.     pu (outfile, \
  396. """</document-info>
  397. <publish-info>
  398. <book-name>{}</book-name>
  399. {}<year>{}</year>
  400. </publish-info>
  401. </description>
  402. <body>
  403. """.format (escape (title), publisher_xml, book.mod.strftime ("%Y")))
  404.     pu (outfile, \
  405. """<title>
  406. <p>{}</p>
  407. </title>
  408. """.format (title))
  409.     for chapter in book.chapters:
  410.       pu (outfile, \
  411. """<section>
  412. <title>
  413. <p><strong>{}</strong></p>
  414. </title>
  415. """.format (escape (chapter.title)))
  416.       if chapter.subtitle is not None:
  417.         pu (outfile, \
  418. """<subtitle>
  419. <p><strong>{}</strong></p>
  420. </subtitle>
  421. """.format (escape (chapter.subtitle)))
  422.       output_tags (outfile, chapter.structure)
  423.       pu (outfile, \
  424. """</section>
  425. """)
  426.     pu (outfile, \
  427. """</body>
  428. </FictionBook>
  429. """)
  430.  
  431. def children_as_list (node):
  432.   c = node.first_child
  433.   children = list ()
  434.   while c is not None:
  435.     children.append (c)
  436.     c = c.next
  437.   return children
  438.  
  439.  
  440. def skip_paragraph (node):
  441.   children = children_as_list (node)
  442.   if len (children) >= 5 and \
  443.       children[0].t in ['a', 'del'] and \
  444.       children[1].t == 'data' and \
  445.       children[2].t == 'a' and \
  446.       children[3].t == 'data' and \
  447.       children[4].t in ['a', 'del']:
  448.     if '|' in children[1].data and '|' in children[3].data:
  449.       return True
  450.  
  451.   return False
  452.  
  453. def skip_href (node):
  454.   children = children_as_list (node)
  455.   if len (children) >= 1 and \
  456.       children[0].t == 'data':
  457.     if '>>' in children[0].data and '<<' in children[0].data:
  458.       return True
  459.  
  460.   return False
  461.  
  462. def output_tags_recursive (outfile, node, inside_para):
  463.   if node.t == 'data':
  464.     pu (outfile, escape (node.data))
  465.     return
  466.  
  467.   if node.t == 'para':
  468.     if skip_paragraph (node):
  469.       return
  470.     if inside_para[0]:
  471.       raise Exception ("Nested <p> tags")
  472.     inside_para[0] = True
  473.     pu (outfile, "<p>")
  474.   elif node.t == 'str':
  475.     pu (outfile, "<strong>")
  476.   elif node.t == 'i':
  477.     pu (outfile, "<emphasis>")
  478.   elif node.t == 'del':
  479.     pu (outfile, "<strikethrough>")
  480.   elif node.t == 'sub':
  481.     pu (outfile, "<sub>")
  482.   elif node.t == 'code':
  483.     pu (outfile, "<code>")
  484.   elif node.t == 'a':
  485.     if skip_href (node):
  486.       return
  487.     pu (outfile, """<a href="{}">""".format (escape (node.href)))
  488.   elif node.t == 'ul':
  489.     pass
  490.   else:
  491.     raise Exception ("Unhandled node type {}".format (node.t))
  492.  
  493.   c = node.first_child
  494.   while c is not None:
  495.     output_tags_recursive (outfile, c, inside_para)
  496.     c = c.next
  497.  
  498.   if node.t == 'para':
  499.     pu (outfile, "</p>")
  500.     inside_para[0] = False
  501.   elif node.t == 'str':
  502.     pu (outfile, "</strong>")
  503.   elif node.t == 'i':
  504.     pu (outfile, "</emphasis>")
  505.   elif node.t == 'del':
  506.     pu (outfile, "</strikethrough>")
  507.   elif node.t == 'sub':
  508.     pu (outfile, "</sub>")
  509.   elif node.t == 'code':
  510.     pu (outfile, "</code>")
  511.   elif node.t == 'a':
  512.     pu (outfile, "</a>")
  513.  
  514. def output_tags (outfile, top):
  515.    c = top.first_child
  516.    while c is not None:
  517.      p = list ()
  518.      p.append (False)
  519.      output_tags_recursive (outfile, c, p)
  520.      c = c.next
  521.  
  522. def main ():
  523.   batch = False
  524.   try:
  525.     parser = argparse.ArgumentParser ()
  526.     parser.add_argument ("-w", "--work-dir", default=".",
  527.                          help="work directory for temporary files (default is %(default)s)")
  528.     parser.add_argument ("-f", "--force", action='store_true', default=False,
  529.                          help="re-download the HTML files even if they are already downloaded (default is %(default)s)")
  530.     parser.add_argument ("-b", "--batch-mode", action='store_true', default=False,
  531.                          help="no user interaction (default is %(default)s)")
  532.     parser.add_argument ("-o", "--output-dir", metavar="OUTDIR", default=".",
  533.                          help="the directory to write FB2 files into (default is %(default)s)")
  534.     args = parser.parse_args ()
  535.     batch = args.batch_mode
  536.     index_filename = os.path.join (args.work_dir, "index.html")
  537.     fetch_page (index_filename, index_url, args.force)
  538.     structure = parse_html (index_filename, VoidDomainIndexParser ())
  539.     books = list ()
  540.     for book_rec in structure:
  541.       chapter_recs = book_rec[2]
  542.       book = Dummy ()
  543.       book.number = book_rec[1]
  544.       book.chapters = list ()
  545.       books.append (book)
  546.       for chap_rec in chapter_recs:
  547.         chapter = Dummy ()
  548.         chapter.url = chap_rec[1]
  549.         chapter.title = chap_rec[2]
  550.         chapter.subtitle = chap_rec[3] if chap_rec[3] else None
  551.         filename = "book-{}-chapter-{}.html".format (book.number, chapter.title)
  552.         filename = os.path.join (args.work_dir, filename)
  553.         chapter.filename = filename
  554.         fetch_page (filename, chapter.url, args.force)
  555.         chapter.structure = parse_html (filename, VoidDomainChapterParser ())
  556.         book.chapters.append (chapter)
  557.       book.post = None
  558.       book.mod = None
  559.       for chapter in book.chapters:
  560.         if chapter.structure.post_date is not None and (book.post is None or chapter.structure.post_date > book.post):
  561.           book.post = chapter.structure.post_date
  562.         if chapter.structure.mod_date is not None and (book.mod is None or chapter.structure.mod_date > book.mod):
  563.           book.mod = chapter.structure.mod_date
  564.       output_fb2 (book, args.output_dir)
  565.   except Exception:
  566.     traceback.print_exc (file=sys.stderr)
  567.   finally:
  568.     sys.stderr.flush ()
  569.     sys.stdout.flush ()
  570.     if not batch:
  571.       input ("Press Enter to end the program")
  572.  
  573.   sys.exit (0)
  574.  
  575. if __name__ == '__main__':
  576.   main ()
  577.  

Replies to Re: Re: Re: Untitled rss

Title Name Language When
Re: Re: Re: Re: Untitled Ivory Guinea Pig python 3 Weeks ago.