scramjet/src/shared/rewriters/html.ts
2024-10-12 15:00:03 -04:00

285 lines
6.7 KiB
TypeScript

import { ElementType, Parser } from "htmlparser2";
import { ChildNode, DomHandler, Element } from "domhandler";
import render from "dom-serializer";
import { URLMeta, rewriteUrl } from "./url";
import { rewriteCss } from "./css";
import { rewriteJs } from "./js";
import { CookieStore } from "../cookie";
export function rewriteHtml(
html: string,
cookieStore: CookieStore,
meta: URLMeta,
fromTop: boolean = false
) {
const handler = new DomHandler((err, dom) => dom);
const parser = new Parser(handler);
parser.write(html);
parser.end();
traverseParsedHtml(handler.root, cookieStore, meta);
function findhead(node) {
if (node.type === ElementType.Tag && node.name === "head") {
return node as Element;
} else if (node.childNodes) {
for (const child of node.childNodes) {
const head = findhead(child);
if (head) return head;
}
}
return null;
}
if (fromTop) {
let head = findhead(handler.root);
if (!head) {
head = new Element("head", {}, []);
handler.root.children.unshift(head);
}
const dump = JSON.stringify(cookieStore.dump());
const injected = `
self.COOKIE = ${dump};
self.$scramjet.config = ${JSON.stringify(self.$scramjet.config)};
self.$scramjet.codec = self.$scramjet.codecs[self.$scramjet.config.codec];
if ("document" in self && document.currentScript) {
document.currentScript.remove();
}
`;
const script = (src) => new Element("script", { src });
head.children.unshift(
script(self.$scramjet.config["wasm"]),
script(self.$scramjet.config["codecs"]),
script("data:application/javascript;base64," + btoa(injected)),
script(self.$scramjet.config["shared"]),
script(self.$scramjet.config["client"])
);
}
return render(handler.root);
}
// type ParseState = {
// base: string;
// origin?: URL;
// };
export function unrewriteHtml(html: string) {
const handler = new DomHandler((err, dom) => dom);
const parser = new Parser(handler);
parser.write(html);
parser.end();
function traverse(node: ChildNode) {
if ("attribs" in node) {
for (const key in node.attribs) {
if (key == "data-scramjet-script-source-src") {
if (node.children[0] && "data" in node.children[0])
node.children[0].data = atob(node.attribs[key]);
continue;
}
if (key.startsWith("data-scramjet-")) {
node.attribs[key.slice(13)] = node.attribs[key];
delete node.attribs[key];
}
}
}
if ("childNodes" in node) {
for (const child of node.childNodes) {
traverse(child);
}
}
}
traverse(handler.root);
return render(handler.root);
}
export const htmlRules: {
[key: string]: "*" | string[] | ((...any: any[]) => string | null);
fn: (value: string, meta: URLMeta, cookieStore: CookieStore) => string | null;
}[] = [
{
fn: (value: string, meta: URLMeta) => {
return rewriteUrl(value, meta);
},
// url rewrites
src: [
"embed",
"script",
"img",
"image",
"iframe",
"source",
"video",
"audio",
"input",
"track",
],
href: ["a", "link", "area"],
data: ["object"],
action: ["form"],
formaction: ["button", "input", "textarea", "submit"],
poster: ["video"],
"xlink:href": ["image"],
},
{
fn: () => null,
// csp stuff that must be deleted
nonce: "*",
integrity: ["script", "link"],
csp: ["iframe"],
},
{
fn: (value: string, meta: URLMeta) => rewriteSrcset(value, meta),
// srcset
srcset: ["img", "source"],
imagesrcset: ["link"],
},
{
fn: (value: string, meta: URLMeta, cookieStore: CookieStore) =>
rewriteHtml(
value,
cookieStore,
{
// for srcdoc origin is the origin of the page that the iframe is on. base and path get dropped
origin: new URL(meta.origin.origin),
base: new URL(meta.origin.origin),
},
true
),
// srcdoc
srcdoc: ["iframe"],
},
{
fn: (value: string, meta: URLMeta) => rewriteCss(value, meta),
style: "*",
},
{
fn: (value: string) => {
if (["_parent", "_top", "_unfencedTop"].includes(value)) return "_self";
},
target: ["a", "base"],
},
];
// i need to add the attributes in during rewriting
function traverseParsedHtml(
node: any,
cookieStore: CookieStore,
meta: URLMeta
) {
if (node.name === "base" && node.attribs.href !== undefined) {
meta.base = new URL(node.attribs.href, meta.origin);
}
if (node.attribs)
for (const rule of htmlRules) {
for (const attr in rule) {
const sel = rule[attr];
if (typeof sel === "function") continue;
if (sel === "*" || sel.includes(node.name)) {
if (node.attribs[attr] !== undefined) {
const value = node.attribs[attr];
const v = rule.fn(value, meta, cookieStore);
if (v === null) delete node.attribs[attr];
else {
node.attribs[attr] = v;
}
node.attribs[`data-scramjet-${attr}`] = value;
}
}
}
}
if (node.name === "style" && node.children[0] !== undefined)
node.children[0].data = rewriteCss(node.children[0].data, meta);
if (
node.name === "script" &&
/(application|text)\/javascript|module|importmap|undefined/.test(
node.attribs.type
) &&
node.children[0] !== undefined
) {
let js = node.children[0].data;
// node.attribs[`data-scramjet-script-source-src`] = btoa(js);
node.attribs["data-scramjet-script-source-src"] = bytesToBase64(
new TextEncoder().encode(js)
);
const htmlcomment = /<!--[\s\S]*?-->/g;
js = js.replace(htmlcomment, "");
node.children[0].data = rewriteJs(js, meta);
}
if (node.name === "meta" && node.attribs["http-equiv"] != undefined) {
if (
node.attribs["http-equiv"].toLowerCase() === "content-security-policy"
) {
node = {};
} else if (
node.attribs["http-equiv"] === "refresh" &&
node.attribs.content.includes("url")
) {
const contentArray = node.attribs.content.split("url=");
if (contentArray[1])
contentArray[1] = rewriteUrl(contentArray[1].trim(), meta);
node.attribs.content = contentArray.join("url=");
}
}
if (node.childNodes) {
for (const childNode in node.childNodes) {
node.childNodes[childNode] = traverseParsedHtml(
node.childNodes[childNode],
cookieStore,
meta
);
}
}
return node;
}
export function rewriteSrcset(srcset: string, meta: URLMeta) {
const urls = srcset.split(/ [0-9]+x,? ?/g);
if (!urls) return "";
const sufixes = srcset.match(/ [0-9]+x,? ?/g);
if (!sufixes) return "";
const rewrittenUrls = urls.map((url, i) => {
if (url && sufixes[i]) {
return rewriteUrl(url, meta) + sufixes[i];
}
});
return rewrittenUrls.join("");
}
function _base64ToBytes(base64) {
const binString = atob(base64);
return Uint8Array.from(binString, (m) => m.codePointAt(0));
}
function bytesToBase64(bytes: Uint8Array) {
const binString = Array.from(bytes, (byte) =>
String.fromCodePoint(byte)
).join("");
return btoa(binString);
}