~ecs/ecs.d2evs.net

b6d2d1d17715b79e37c4b302d345967f4b2ab158 — Eyal Sawady 3 months ago da9b5b2
web.sh: initial commit
2 files changed, 143 insertions(+), 0 deletions(-)

A proxy/web.sh
A proxy/web2gmi.js
A proxy/web.sh => proxy/web.sh +11 -0
@@ 0,0 1,11 @@
#!/bin/sh
# CGI script for the web-to-gemini proxy.
#
# Reads the target web URL from $QUERY_STRING. With no query it answers with
# a "10" (input) response asking for a URL; otherwise it answers "20
# text/gemini", prints a notice, pipes web2gmi.js into node to render the
# page, and finishes with a link back to the original URL.
if [ "$QUERY_STRING" = "" ]
then
	printf "10 Enter a web url\r\n"
	# `return` is only defined inside a function or a sourced script; at the
	# top level some shells report an error and keep running, which would
	# send a second response header. `exit` ends the script in every shell.
	exit 0
fi

printf "20 text/gemini\r\n"
printf "NOTICE: This page was automatically generated from HTML. Your mileage may vary.\n\n"
# The script is fed on stdin ("-"), so "$QUERY_STRING" becomes its first argument.
node --input-type=module - <web2gmi.js "$QUERY_STRING"
printf "---\n\n=> %s View original (http)" "$QUERY_STRING"

A proxy/web2gmi.js => proxy/web2gmi.js +132 -0
@@ 0,0 1,132 @@
import { Readability } from '@mozilla/readability';
import jsdom from 'jsdom';
const JSDOM = jsdom.JSDOM;
import fetch from 'node-fetch';
import sanitizer from 'sanitize-html';
import { collapseWhiteSpace } from 'collapse-white-space';
import entities from 'html-entities';

// Set the QuerySelector flag on jsdom's default document features.
// NOTE(review): presumably needed so querySelector/querySelectorAll work on
// parsed documents — confirm the installed jsdom version still reads this.
jsdom.defaultDocumentFeatures = { QuerySelector: true };

/**
 * Reduces an HTML fragment to plain text.
 *
 * The markup is passed through collapseWhiteSpace, then through the
 * sanitizer with empty allow-lists (so no tag or attribute survives), then
 * entity-decoded and trimmed.
 *
 * @param {string} html - HTML markup to flatten.
 * @returns {string} The text content with HTML entities decoded.
 */
// `decode` translates every entity found in the text; `decodeEntity` only
// translates a string that is itself exactly one entity, which left
// sequences such as "&amp;" untouched inside longer text.
const sanitize = html => entities.decode(sanitizer(collapseWhiteSpace(html), {
  allowedTags: [],
  allowedAttributes: {},
})).trim();
/**
 * Reduces preformatted HTML to plain text without collapsing whitespace.
 *
 * The markup goes through the sanitizer with empty allow-lists (so no tag or
 * attribute survives), is entity-decoded, and is trimmed at both ends;
 * whitespace inside the text is left as it was.
 *
 * @param {string} html - HTML markup of a preformatted element.
 * @returns {string} The text content with HTML entities decoded.
 */
// `decode` translates every entity found in the text; `decodeEntity` only
// translates a string that is itself exactly one entity, which left
// sequences such as "&lt;" untouched inside code samples.
const sanitizePre = html => entities.decode(sanitizer(html, {
  allowedTags: [],
  allowedAttributes: {},
})).trim();

/**
 * Renders an article document as gemtext.
 *
 * Headings, paragraphs, images, block quotes, preformatted blocks, links and
 * lists found under `.page` are emitted in document order. Links inside a
 * paragraph or list are listed after that paragraph or list.
 *
 * @param {Document} dom - Document to convert; elements are looked up under `.page`.
 * @param {string} title - Heading to emit when the first matched element is not an `<h1>`.
 * @returns {string} The gemtext, or "Unable to process this URL\n" when no
 *   supported element was found.
 */
const convert = (dom, title) => {
  const elements = [...dom.querySelectorAll(`
    .page h1, .page h2, .page h3, .page h4, .page h5,
    .page p,
    .page img,
    .page blockquote,
    .page pre,
    .page a,
    .page ul, .page ol
  `)];
  if (elements.length === 0) {
    return "Unable to process this URL\n";
  }

  let output = "";
  // Only synthesize a top heading when there is a title to show and the
  // article does not already start with its own <h1>.
  if (title && elements[0].tagName.toLowerCase() !== "h1") {
    output += `# ${title}\n\n`;
  }

  // http(s) targets are pointed back at this proxy as a query string.
  const rewriteLink = href =>
    "?" + encodeURIComponent(href);

  const emitLink = link => {
    // An anchor without a target would produce a link line with no URL.
    if (!link.href) {
      return;
    }
    let desc = sanitize(link.innerHTML);
    if (desc === "") {
      // Image-only link: fall back to the image's alt text, if it has one.
      const img = link.querySelector("img");
      if (img && img.alt) {
        desc = img.alt;
      }
    }
    const isWeb = link.protocol === "http:" || link.protocol === "https:";
    const target = isWeb ? rewriteLink(link.href) : link.href;
    output += desc === "" ? `=> ${target}\n` : `=> ${target} ${desc}\n`;
  };

  const visited = [];
  for (const el of elements) {
    // The selector also matches elements nested inside ones that were already
    // rendered (e.g. an <a> inside a <p>); those must not be emitted twice.
    if (visited.some(seen => seen.contains(el))) {
      continue;
    }
    visited.push(el);

    switch (el.tagName.toLowerCase()) {
    case 'p': {
        output += sanitize(el.innerHTML) + "\n\n";
        const links = [...el.querySelectorAll("a")];
        links.forEach(link => emitLink(link));
        if (links.length !== 0) {
          output += "\n";
        }
        break;
    }
    case 'a':
        emitLink(el);
        break;
    case 'ul':
    case 'ol': {
        const links = [...el.querySelectorAll("a")];
        for (const item of el.children) {
          output += `* ${sanitize(item.innerHTML)}\n`;
        }
        output += "\n";

        if (links.length !== 0) {
          output += "\n";
          links.forEach(link => emitLink(link));
          output += "\n";
        }
        break;
    }
    case 'h1':
        output += `# ${sanitize(el.innerHTML)}\n\n`;
        break;
    case 'h2':
        output += `## ${sanitize(el.innerHTML)}\n\n`;
        break;
    case 'h3':
    case 'h4':
    case 'h5':
        // Deeper heading levels all map onto the third level.
        output += `### ${sanitize(el.innerHTML)}\n\n`;
        break;
    case 'img':
        output += `=> ${el.src} ${el.alt ? el.alt : "(image)"}\n\n`;
        break;
    case 'pre':
        // The blank line belongs after the closing fence, not inside the
        // preformatted block.
        output += "```\n";
        output += sanitizePre(el.innerHTML) + "\n";
        output += "```\n\n";
        break;
    case 'blockquote':
        output += `> ${sanitize(el.innerHTML)}\n\n`;
        break;
    }
  }
  // Collapse the runs of blank lines left where blocks meet.
  // NOTE(review): this also collapses repeated blank lines inside
  // preformatted blocks.
  return output.replace(/\n\n\n+/g, "\n\n").trim();
};

// Entry point: fetch the URL given as the first command-line argument, let
// Readability extract the article from it, and print the gemtext rendering.
const args = process.argv.slice(2);
fetch(args[0])
  .then(res => res.text())
  .then(body => {
    const doc = new JSDOM(body, {
      url: args[0],
    });
    const reader = new Readability(doc.window.document);
    const article = reader.parse();
    // parse() yields null when no article could be extracted; report that
    // the same way convert() does instead of failing on `article.content`
    // and blaming the fetch.
    if (!article) {
      console.log("Unable to process this URL\n");
      return;
    }
    const readable = new JSDOM(article.content, {url: args[0]});
    console.log(convert(readable.window.document, article.title));
  })
  .catch((e) => console.log("An error occured while fetching this page:", e));