// Extract top 30 links from Hacker News homepage
// TASK: hackerNewsHome
// Run this task with:
// forge task:run cloudflare:hackerNewsHome
import axios from 'axios';
import { createTask } from '@forgehive/task'
import { Schema } from '@forgehive/schema'
import { NewsModel } from '../../models/index.js'
// Task identity used by the forge CLI (see `forge task:run cloudflare:hackerNewsHome` above)
const name = 'cloudflare:hackerNewsHome'
const description = 'Extract top 30 links from Hacker News homepage'
// Empty schema: the task takes no input arguments
const schema = new Schema({})
/** A single name/value pair for a scraped HTML attribute. */
interface ElementAttributes {
  name: string;
  value: string;
}

/** One matched element: its text content and its attributes, when present. */
interface ElementResultItem {
  text?: string;
  attributes?: ElementAttributes[];
}

/**
 * One entry of the Cloudflare scrape response: the selector that was
 * requested plus every element it matched.
 * (Previously re-declared the item/attribute shapes inline; now reuses the
 * named types above — structurally identical, so no caller is affected.)
 */
interface ElementResult {
  selector: string;
  results: ElementResultItem[];
}

/** One extracted front-page story, with absolute URLs. */
interface HackerNewsItem {
  title: string;
  link: string;
  commentLink: string;
}
const boundaries = {
  /**
   * Calls the Cloudflare Browser Rendering `/scrape` endpoint for the
   * Hacker News front page and returns the raw API payload.
   *
   * Reads CLOUDFLARE_API_TOKEN and CLOUDFLARE_ACCOUNT_ID from the
   * environment.
   *
   * @returns the Cloudflare response body (expected to carry a `result`
   *          array with one entry per requested selector)
   * @throws when credentials are missing or the HTTP request fails
   */
  scrapeHackerNews: async () => {
    const apiToken = process.env.CLOUDFLARE_API_TOKEN
    const accountId = process.env.CLOUDFLARE_ACCOUNT_ID
    if (!apiToken || !accountId) {
      throw new Error('Missing Cloudflare credentials')
    }
    try {
      const response = await axios.post(
        `https://api.cloudflare.com/client/v4/accounts/${accountId}/browser-rendering/scrape`,
        {
          url: 'https://news.ycombinator.com',
          // One entry per selector; results come back grouped per selector.
          elements: [
            { selector: "tr.athing" },
            { selector: "span.titleline > a" },
            { selector: "span.subline a[href*='item?id=']" },
            { selector: "span.subline a[href*='user?id=']" },
            { selector: "span.rank" }
          ]
        },
        {
          headers: {
            'Content-Type': 'application/json',
            'Authorization': `Bearer ${apiToken}`
          }
        }
      )
      return response.data
    } catch (error: unknown) {
      const errorMessage = error instanceof Error ? error.message : 'Unknown error';
      throw new Error(`Failed to scrape Hacker News: ${errorMessage}`)
    }
  },

  /**
   * Persists one scraped story via NewsModel, deduplicating on hackerNewsId.
   *
   * @returns 'saved' for a brand-new document, 'updated' when an existing
   *          item's rank changed, 'skipped' when it was seen again at the
   *          same rank (only lastSeen is refreshed)
   * @throws wrapping any database error with context
   */
  saveNewsItem: async (newsItem: { title: string; url: string; submitter: string; hackerNewsId: string; rank: number }) => {
    try {
      const now = new Date();
      // Check if item already exists by hackerNewsId
      const existingItem = await NewsModel.findOne({ hackerNewsId: newsItem.hackerNewsId });
      if (existingItem) {
        // lastSeen is always refreshed; rank/updatedAt only when the rank moved.
        // (Was `any`; the explicit shape keeps strict checking intact.)
        const updateFields: { lastSeen: Date; rank?: number; updatedAt?: Date } = { lastSeen: now };
        const rankChanged = existingItem.rank !== newsItem.rank;
        if (rankChanged) {
          updateFields.rank = newsItem.rank;
          updateFields.updatedAt = now;
        }
        // Single update call for both cases (the two branches previously
        // duplicated this statement verbatim).
        await NewsModel.updateOne(
          { hackerNewsId: newsItem.hackerNewsId },
          updateFields
        );
        if (rankChanged) {
          console.log(`Updated rank for "${newsItem.title}": ${existingItem.rank} -> ${newsItem.rank}`);
          return 'updated'; // Return 'updated' when rank changes
        }
        console.log(`News item seen again with same rank: ${newsItem.title}`);
        return 'skipped'; // Return 'skipped' for items with same rank
      }
      // Create new item (ingestedAt and lastSeen are the same for new items).
      // The created document itself is not needed, so it is not kept.
      await NewsModel.create({
        title: newsItem.title,
        url: newsItem.url,
        submitter: newsItem.submitter,
        hackerNewsId: newsItem.hackerNewsId,
        rank: newsItem.rank,
        ingestedAt: now,
        lastSeen: now
      });
      return 'saved'; // Return 'saved' for new items
    } catch (error: unknown) {
      const errorMessage = error instanceof Error ? error.message : 'Unknown error';
      throw new Error(`Failed to save news item: ${errorMessage}`);
    }
  }
}
/**
 * Task entry point for `cloudflare:hackerNewsHome`.
 *
 * Scrapes the Hacker News front page through the scrapeHackerNews boundary,
 * extracts title links, comment links, submitters and ranks from the
 * per-selector results, then persists up to 30 index-aligned items through
 * the saveNewsItem boundary and returns a summary message.
 *
 * NOTE(review): the four extracted arrays are correlated purely by index.
 * A front-page row without a subline (e.g. a job posting) would shift
 * commentLinks/submitters relative to titleLinks — TODO confirm this cannot
 * occur or guard against it.
 */
export const hackerNewsHome = createTask({
name,
description,
schema,
boundaries,
fn: async function (argv, { scrapeHackerNews, saveNewsItem }) {
try {
// Scrape the Hacker News homepage via the Cloudflare boundary
const scrapeResult = await scrapeHackerNews()
if (!scrapeResult || !scrapeResult.result) {
throw new Error('No scrape results found')
}
// Extract title links: text + href from every "span.titleline > a" match
const titleLinks: Array<{text: string, href: string}> = []
const titleResult = scrapeResult.result.find((item: ElementResult) => item.selector === "span.titleline > a")
if (titleResult && titleResult.results) {
titleResult.results.forEach((result: ElementResultItem) => {
if (result.text && result.attributes) {
const hrefAttr = result.attributes.find((attr: ElementAttributes) => attr.name === 'href')
if (hrefAttr) {
titleLinks.push({
text: result.text,
href: hrefAttr.value
})
}
}
})
}
// Extract comment links (skip time links, take only comment/discuss links)
// NOTE(review): assumes each subline produces exactly two item?id= anchors
// (age link first, comments link second), so odd indices are the comment
// links — TODO confirm against the actual scraped markup.
const commentLinks: Array<string> = []
const commentResult = scrapeResult.result.find((item: ElementResult) => item.selector === "span.subline a[href*='item?id=']")
if (commentResult && commentResult.results) {
// Skip every other link (time links) and take only comment links
commentResult.results.forEach((result: ElementResultItem, index: number) => {
// Take every second link starting from index 1 (0=time, 1=comments, 2=time, 3=comments, etc.)
if (index % 2 === 1 && result.attributes) {
const hrefAttr = result.attributes.find((attr: ElementAttributes) => attr.name === 'href')
if (hrefAttr && hrefAttr.value.includes('item?id=')) {
commentLinks.push(`https://news.ycombinator.com/${hrefAttr.value}`)
}
}
})
}
// Extract submitters (username text from user?id= links)
const submitters: Array<string> = []
const submitterResult = scrapeResult.result.find((item: ElementResult) => item.selector === "span.subline a[href*='user?id=']")
if (submitterResult && submitterResult.results) {
submitterResult.results.forEach((result: ElementResultItem) => {
if (result.text) {
submitters.push(result.text)
}
})
}
// Extract ranks as numbers
const ranks: Array<number> = []
const rankResult = scrapeResult.result.find((item: ElementResult) => item.selector === "span.rank")
if (rankResult && rankResult.results) {
rankResult.results.forEach((result: ElementResultItem) => {
if (result.text) {
// Extract number from rank text (e.g., "2." -> 2)
const rankMatch = result.text.match(/(\d+)\./);
if (rankMatch) {
ranks.push(parseInt(rankMatch[1], 10))
}
}
})
}
// Create and save the top 30 items; cap at the shortest array so every
// item has a title, comment link, submitter and rank at the same index.
const items: HackerNewsItem[] = []
const savedCount = { successful: 0, duplicates: 0, updates: 0, errors: 0 }
const maxItems = Math.min(30, titleLinks.length, commentLinks.length, submitters.length, ranks.length)
for (let i = 0; i < maxItems; i++) {
const titleLink = titleLinks[i]
const commentLink = commentLinks[i]
const submitter = submitters[i] || 'unknown'
const rank = ranks[i]
if (titleLink && commentLink && rank) {
// Extract Hacker News ID from comment link (e.g., "item?id=12345" -> "12345")
const idMatch = commentLink.match(/item\?id=(\d+)/);
const hackerNewsId = idMatch ? idMatch[1] : '';
if (!hackerNewsId) {
console.warn(`Could not extract Hacker News ID from: ${commentLink}`);
continue; // Skip items without valid ID
}
// Relative story hrefs (e.g. "item?id=…") are made absolute against HN.
const newsItem = {
title: titleLink.text,
link: titleLink.href.startsWith('http') ? titleLink.href : `https://news.ycombinator.com/${titleLink.href}`,
commentLink: commentLink
}
// Save to database with hackerNewsId and rank; a single failed save is
// counted but does not abort the rest of the batch.
try {
const result = await saveNewsItem({
title: newsItem.title,
url: newsItem.link,
submitter: submitter,
hackerNewsId: hackerNewsId,
rank: rank
})
if (result === 'saved') {
savedCount.successful++
} else if (result === 'skipped') {
savedCount.duplicates++
} else if (result === 'updated') {
savedCount.updates++
}
} catch (error) {
console.error(`Failed to save news item "${newsItem.title}":`, error)
savedCount.errors++
}
// Items are returned even when the database save failed.
items.push(newsItem)
}
}
return {
success: true,
items: items,
message: `Successfully extracted ${items.length} items from Hacker News homepage. Saved: ${savedCount.successful} new, ${savedCount.updates} updated, ${savedCount.duplicates} duplicates, ${savedCount.errors} errors.`
}
} catch (error) {
const errorMessage = error instanceof Error ? error.message : 'Unknown error';
throw new Error(`Failed to process Hacker News data: ${errorMessage}`)
}
}
})