downthemall/scripts/gather.ts
2019-09-10 12:44:56 +02:00

329 lines
7.9 KiB
TypeScript

"use strict";
// License: MIT
import { getTextLinks } from "../lib/textlinks";
import { runtime } from "../lib/browser";
// Matches runs of whitespace; sanitize() replaces each run with one space.
// (`\s` already covers `\t`, `\r`, `\n` and `\v`; they are listed explicitly.)
const REG_CLEAN = /[\s\t\r\n\v]+/g;
/**
 * Base URL against which gathered URLs are resolved, computed once when the
 * script is evaluated.
 *
 * Taken from the first `<base href>` element when that yields a parsable URL,
 * otherwise from the document location. The fragment is always removed.
 */
const baseURL = function() {
  const base = document.querySelector("base[href]");
  let url: URL | undefined;
  if (base) {
    try {
      const burl = base.getAttribute("href");
      if (burl) {
        // A <base href> may be relative (e.g. "/files/"); resolve it against
        // the document location instead of requiring an absolute URL, which
        // would throw and silently discard a perfectly valid base.
        url = new URL(burl, location.href);
      }
    }
    catch (ex) {
      // Unusable <base href>; fall back to the document location below.
    }
  }
  if (!url) {
    url = new URL(location.href);
  }
  url.hash = "";
  return url;
}();
/**
 * Resolve `url` against the page base URL and strip its fragment.
 *
 * @param url Absolute or relative URL string.
 * @returns The resolved URL object with an empty hash.
 * @throws Whatever the URL constructor throws for unparsable input.
 */
function makeURL(url: string) {
  // Object.assign goes through the `hash` setter, which clears the fragment.
  const resolved: URL = Object.assign(new URL(url, baseURL), {hash: ""});
  return resolved;
}
/**
 * Normalize a possibly missing string: collapse whitespace runs into single
 * spaces and trim the ends.
 *
 * @param str Input text; may be null or undefined.
 * @returns The cleaned text, or "" when there is nothing usable.
 */
function sanitize(str: string | null | undefined) {
  if (!str) {
    return "";
  }
  return str.replace(REG_CLEAN, " ").trim();
}
/**
 * Walk the subtree below `parent` in document order and yield the sanitized
 * content of every text node that is not empty after sanitizing.
 *
 * @param parent Node whose descendants are visited.
 */
function *extractDescriptionInternal(parent: Node): Iterable<string> {
  // Snapshot the children up front, then iterate over the copy.
  const children = Array.from(parent.childNodes);
  for (const child of children) {
    const kind = child.nodeType;
    if (kind === Node.TEXT_NODE) {
      const text = sanitize(child.textContent);
      if (text) {
        yield text;
      }
    }
    else if (kind === Node.ELEMENT_NODE) {
      // Recurse into elements; every other node type is skipped.
      yield *extractDescriptionInternal(child);
    }
  }
}
/**
 * Build a description for an element by joining all of its non-empty,
 * sanitized text fragments with single spaces.
 *
 * @param el Element whose text content is collected.
 * @returns The joined text; "" when the element has no usable text.
 */
function extractDescription(el: HTMLElement) {
  const fragments: string[] = [];
  for (const fragment of extractDescriptionInternal(el)) {
    fragments.push(fragment);
  }
  return fragments.join(" ");
}
/**
 * Annotate `e` with a `usable` property derived from the URL string `u`.
 *
 * `usable` becomes the percent-decoded form of `u` when decoding changes it,
 * or `true` when `u` is already identical to its decoded form. If decoding
 * fails, `e` is left untouched.
 *
 * @param e Object receiving the `usable` property.
 * @param u URL string to decode.
 */
function urlToUsable(e: any, u: string) {
  try {
    const decoded = decodeURIComponent(u);
    e.usable = decoded === u ? true : decoded;
  }
  catch (ex) {
    // ignore
  }
}
/**
 * Extract the image URLs from a `srcset` attribute value.
 *
 * Each comma-separated candidate is trimmed and then cut at its first run of
 * whitespace, which drops the width/density descriptor. Empty candidates
 * (e.g. produced by a trailing comma) are skipped.
 *
 * NOTE: URLs that themselves contain commas are still split apart.
 *
 * @param srcset Raw `srcset` value.
 * @returns The candidate URLs in source order.
 */
function parseSrcsetURLs(srcset: string): string[] {
  const urls: string[] = [];
  for (const candidate of srcset.split(",")) {
    const [url] = candidate.trim().split(/\s+/, 1);
    if (url) {
      urls.push(url);
    }
  }
  return urls;
}

/**
 * Collects downloadable items (links, images, audio/video sources and URLs
 * found in plain text) from the current document.
 *
 * An item is a plain object with `url`, `title` and `private`; collectors add
 * `fileName`, `description` and possibly `usable` on top.
 */
class Gatherer {
  private: boolean;

  textLinks: boolean;

  selectionOnly: boolean;

  selection: Selection | null;

  schemes: Set<string>;

  transferable: string[];

  /**
   * @param options Gather request. Reads `private`, `textLinks`,
   *   `selectionOnly`, `schemes` (accepted URL protocols, compared against
   *   `URL.protocol`) and `transferable` (property names that may be copied
   *   between duplicate items).
   */
  constructor(options: any) {
    this.private = !!options.private;
    this.textLinks = options.textLinks;
    this.selectionOnly = options.selectionOnly;
    this.selection = options.selectionOnly ? getSelection() : null;
    this.schemes = new Set(options.schemes);
    this.transferable = options.transferable;
    // Bind the collectors so they can be passed directly to map()/flatMap().
    this.collectLink = this.collectLink.bind(this);
    this.collectImage = this.collectImage.bind(this);
    this.collectMedia = this.collectMedia.bind(this);
    Object.freeze(this);
  }

  /**
   * Build an item for an anchor element.
   *
   * @returns The item, or null when the link is filtered out or fails.
   */
  collectLink(a: HTMLAnchorElement) {
    try {
      const item = this.makeItem(a.href, a);
      if (!item) {
        return item;
      }
      urlToUsable(item, item.url);
      item.fileName = sanitize(a.getAttribute("download"));
      item.description = extractDescription(a);
      return item;
    }
    catch (ex) {
      console.error("oopsed link", ex.toString(), ex);
    }
    return null;
  }

  /**
   * Yield items for the image's current source and for every `srcset`
   * candidate URL.
   */
  *collectImageInternal(img: HTMLImageElement) {
    try {
      const src = img.currentSrc || img.src;
      // An empty source would resolve to the page URL itself; skip it, as
      // collectMediaInternal already does.
      const item = src ? this.makeItem(src, img) : null;
      if (item) {
        item.fileName = "";
        item.description = item.title;
        yield item;
      }
      const {srcset} = img;
      if (!srcset) {
        return;
      }
      for (const candidate of parseSrcsetURLs(srcset)) {
        const item = this.makeItem(candidate, img);
        if (item) {
          item.fileName = "";
          item.description = item.title;
          yield item;
        }
      }
    }
    catch (ex) {
      console.error("oops image", ex.toString(), ex.stack, ex);
    }
  }

  /** Array form of collectImageInternal(), suitable for flatMap(). */
  collectImage(img: HTMLImageElement) {
    return [...this.collectImageInternal(img)];
  }

  /**
   * Build an item for one media element or one of its `<source>` children.
   *
   * @param title Fallback title used when `el` has no `title` attribute.
   * @param el Element providing `currentSrc` or a `src` attribute.
   * @returns The item, or null when there is no source or it is filtered out.
   */
  collectMediaInternal(title: string | undefined | null, el: HTMLMediaElement) {
    try {
      const src = el.currentSrc || el.getAttribute("src");
      if (!src) {
        return null;
      }
      const item = this.makeItem(src, el, title);
      if (!item) {
        return null;
      }
      item.fileName = "";
      item.description = item.title;
      return item;
    }
    catch (ex) {
      console.error("Failed to get media from", el && el.outerHTML, ex);
    }
    return null;
  }

  /**
   * Collect items for an audio/video element and all of its `<source>`
   * children. The returned array may contain null entries for sources that
   * produced no item; makeUniqueItems() skips those.
   */
  collectMedia(el: HTMLMediaElement) {
    try {
      const item = this.collectMediaInternal(el.getAttribute("title"), el);
      const rv = item ? [item] : [];
      // <source> children inherit the title of the parent media element.
      const title: string | undefined = item && item.title ||
        el.getAttribute("title");
      rv.push(...Array.from(el.querySelectorAll("source")).
        map(this.collectMediaInternal.bind(this, title)));
      return rv;
    }
    catch (ex) {
      // Logged as an error, like the failures of the other collectors.
      console.error("oopsed media", ex.toString(), ex);
    }
    return [];
  }

  /**
   * Yield the content of every non-empty text node that is not inside an
   * `a`, `style` or `script` element (per the XPath expression below).
   *
   * In selection-only mode the selected ranges are cloned into a scratch
   * document and that document is searched instead of the page.
   */
  *findTexts() {
    let doc = document;
    const {selection} = this;
    if (this.selectionOnly && selection) {
      const copy = document.createElement("div");
      for (let i = 0; i < selection.rangeCount; ++i) {
        const r = selection.getRangeAt(i);
        copy.appendChild(r.cloneContents());
      }
      doc = document.implementation.createDocument(
        "http://www.w3.org/1999/xhtml", "html", null);
      // Adopting once is enough to move the clone into the scratch document.
      doc.documentElement.appendChild(doc.adoptNode(copy));
    }
    const set = doc.evaluate(
      "//*[not(ancestor-or-self::a) and " +
      "not(ancestor-or-self::style) and " +
      "not(ancestor-or-self::script)]/text()",
      doc,
      null,
      XPathResult.ORDERED_NODE_ITERATOR_TYPE,
      null
    );
    for (let r = set.iterateNext(); r; r = set.iterateNext()) {
      const {textContent} = r;
      if (textContent) {
        yield textContent;
      }
    }
  }

  /** Yield everything getTextLinks() produces for each text from findTexts(). */
  *findTextLinks() {
    for (const text of this.findTexts()) {
      yield *getTextLinks(text, true);
    }
  }

  /**
   * Build items for URLs found in plain text.
   *
   * @returns The items (entries may be null), or [] when text links are
   *   disabled or collection fails.
   */
  collectTextLinks() {
    if (!this.textLinks) {
      return [];
    }
    try {
      return Array.from(this.findTextLinks()).
        map(link => this.makeItem(link.href, link));
    }
    catch (ex) {
      console.error("oopsed textlinks", ex.toString(), ex);
    }
    return [];
  }

  /**
   * Create the basic item for a URL found on an element.
   *
   * @param surl URL string, resolved via makeURL().
   * @param el Element the URL came from; elements flagged `fake` bypass the
   *   selection check.
   * @param title Fallback title used when `el` has no `title` attribute.
   * @returns `{url, title, private}`, or null when the element lies outside
   *   the selection (selection-only mode), the protocol is not accepted, or
   *   the URL cannot be parsed.
   */
  makeItem(surl: string, el: HTMLElement, title?: string | null): any {
    if (!(el as any).fake && this.selectionOnly &&
      (!this.selection || !this.selection.containsNode(el, true))) {
      return null;
    }
    try {
      const url = makeURL(surl);
      if (!this.schemes.has(url.protocol)) {
        return null;
      }
      title = sanitize(el.getAttribute("title") || title) ||
        sanitize(el.getAttribute("alt"));
      return {
        url: url.href,
        title,
        private: this.private
      };
    }
    catch (ex) {
      console.error("failed to make", surl, ex.message);
      return null;
    }
  }

  /**
   * Append the items of `arr` whose URL is not yet in `known` to `result`.
   *
   * For a duplicate URL, the `transferable` properties that are falsy on the
   * first-seen item are filled in from the duplicate. Falsy entries and
   * entries without a `url` are skipped.
   */
  makeUniqueItemsInternal(arr: any[], known: Map<string, any>, result: any[]) {
    for (const e of arr) {
      if (!e || !e.url) {
        continue;
      }
      const other = known.get(e.url);
      if (other) {
        for (const p of this.transferable) {
          if (!other[p] && e[p]) {
            other[p] = e[p];
          }
        }
        continue;
      }
      known.set(e.url, e);
      result.push(e);
    }
  }

  /**
   * Merge several item arrays into one list with unique URLs, keeping the
   * order of first appearance.
   */
  makeUniqueItems(...arrs: any[]) {
    const known = new Map();
    const result: any[] = [];
    for (const arr of arrs) {
      this.makeUniqueItemsInternal(arr, known, result);
    }
    return result;
  }
}
/**
 * Message handler that gathers links and media from the current document.
 *
 * @param msg Incoming message; handled only when `msg.type` is "DTA:gather".
 *   It is also passed to the Gatherer as its options.
 * @param sender Unused.
 * @param callback Must be truthy for the request to be handled; never called.
 * @returns A promise resolving to `{baseURL, links, media}` (plus `usable`
 *   when the base URL decodes), or to null for foreign messages and failures.
 */
function gather(msg: any, sender: any, callback: Function) {
  try {
    const wanted = msg && msg.type === "DTA:gather" && callback;
    if (!wanted) {
      return Promise.resolve(null);
    }

    const gatherer = new Gatherer(msg);
    const base = baseURL.href;

    // Links: real anchors first, then URLs detected in plain text.
    const anchors = Array.from(document.links).map(gatherer.collectLink);
    const textLinks = gatherer.collectTextLinks();
    const links = gatherer.makeUniqueItems(anchors, textLinks);

    // Media: images, then video, then audio.
    const images = Array.from(document.querySelectorAll("img")).
      flatMap(gatherer.collectImage);
    const videos = Array.from(document.querySelectorAll("video")).
      flatMap(gatherer.collectMedia);
    const audios = Array.from(document.querySelectorAll("audio")).
      flatMap(gatherer.collectMedia);
    const media = gatherer.makeUniqueItems(images, videos, audios);

    const result = {
      baseURL: base,
      links,
      media,
    };
    urlToUsable(result, result.baseURL);
    return Promise.resolve(result);
  }
  catch (ex) {
    console.error(ex.toString(), ex.stack, ex);
    return Promise.resolve(null);
  }
}
// Register gather() as a listener for incoming runtime messages.
runtime.onMessage.addListener(gather);