Finding good SEO tools on Google is like finding non-lemon cars at a used-car dealership in an old strip mall.
There are lots of expensive SEO products selling information on backlinks and keywords, but there are few working solutions for just building a CSV or spreadsheet of path/title/h1/meta tags for a given website.
Writing a script yourself, however, is not hard, and the end result is exactly that CSV.
I used headless-chrome-crawler, a library built on Puppeteer, which in turn drives Headless Chrome. It includes a crawler that obeys robots.txt and can interpret sitemaps (more on toggling that behavior after the script).
Create an entry file and install the library:

$ touch index.js
$ yarn add headless-chrome-crawler

Drop the script below into index.js, set SCRAPE_URL to the site you want to crawl, and run it:

$ node index.js
const SCRAPE_URL = "https://www.example.com/";
const MAX_DEPTH = 5; // depth of the followed links
const FILE = "./result.csv";

const url = require("url");
const HCCrawler = require("headless-chrome-crawler");
const CSVExporter = require("headless-chrome-crawler/exporter/csv");

// Write the extracted fields for each crawled page as one CSV row
const exporter = new CSVExporter({
  file: FILE,
  fields: [
    "result.path",
    "result.title",
    "result.h1",
    "result.metaDescription",
    "result.ogTitle",
    "result.ogDescription",
    "result.ogImage"
  ]
});

(async () => {
  const crawler = await HCCrawler.launch({
    // Function to be evaluated in browsers
    evaluatePage: () => {
      // First matched element's text, trimmed and with whitespace collapsed
      const firstAndTrimText = el =>
        el &&
        el.text() &&
        el
          .first()
          .text()
          .trim()
          .replace(/\s+/g, " ");
      // First matched element's "content" attribute, trimmed
      const firstAndTrimContent = el =>
        el &&
        el.attr("content") &&
        el
          .first()
          .attr("content")
          .trim();
      return {
        // Reduce the full URL to its path by stripping the scheme and host
        path: window.location.href
          .replace(window.location.host, "")
          .replace(/https?:\/\//, ""),
        title: firstAndTrimText($("title")),
        h1: firstAndTrimText($("h1")),
        metaDescription: firstAndTrimContent($("meta[name='description']")),
        ogTitle: firstAndTrimContent($("meta[property='og:title']")),
        ogDescription: firstAndTrimContent(
          $("meta[property='og:description']")
        ),
        ogImage: firstAndTrimContent($("meta[property='og:image']"))
      };
    },
    maxConcurrency: 1, // crawl one page at a time
    exporter,
    onSuccess: result => {
      console.log(result.response.url);
    },
    onError: err => {
      console.error(err);
    }
  });
  // Queue a request
  await crawler.queue({
    url: SCRAPE_URL,
    maxDepth: MAX_DEPTH,
    allowedDomains: [url.parse(SCRAPE_URL).host], // stay on the same domain
    skipDuplicates: true
  });
  await crawler.onIdle(); // Resolved when no queue is left
  await crawler.close(); // Close the crawler
})();
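About the robots.txt and sitemap handling mentioned at the top: the library exposes these as crawl options, named obeyRobotsTxt and followSitemapXml in the version I used. The snippet below is a minimal sketch with those options spelled out; the one-field evaluatePage is only there for brevity, and you should double-check the option names against the release you actually install.

// Minimal sketch: same crawl pattern as above, with the robots.txt and
// sitemap options made explicit. The option names (obeyRobotsTxt,
// followSitemapXml) are assumptions to verify against your installed version.
const HCCrawler = require("headless-chrome-crawler");

(async () => {
  const crawler = await HCCrawler.launch({
    // Trimmed-down extractor: grab only the <title> text
    evaluatePage: () => ({ title: $("title").text().trim() }),
    obeyRobotsTxt: true,    // skip URLs disallowed by robots.txt
    followSitemapXml: true, // also queue URLs discovered via sitemap.xml
    maxConcurrency: 1,      // crawl politely, one page at a time
    onSuccess: result => console.log(result.response.url, result.result.title)
  });
  await crawler.queue({ url: "https://www.example.com/", maxDepth: 2 });
  await crawler.onIdle();
  await crawler.close();
})();

If a robots.txt is blocking pages on a site you own and want to audit anyway, flipping obeyRobotsTxt to false is the escape hatch.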
With the full script, you end up with result.csv, which can be imported into most spreadsheet software.
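For a quick sanity check before opening a spreadsheet app, a few lines of Node will print the row count and the first few rows. This is just a rough sketch; it assumes the exporter wrote a plain comma-separated file with a header row:

// Rough sketch: preview result.csv from the command line.
// Assumes a plain comma-separated file with a header row.
const fs = require("fs");

const rows = fs
  .readFileSync("./result.csv", "utf8")
  .trim()
  .split("\n");

console.log(`${rows.length - 1} pages crawled`); // minus the header row
rows.slice(0, 6).forEach(row => console.log(row));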