fix: fix broken answers feed

This commit fixes the issue of answers not being shown: the underlying HTML changed,
which broke the old scraping logic.

BREAKING CHANGE: the answer route will not work on older versions

fix https://github.com/zyachel/quetre/issues/96
This commit is contained in:
zyachel 2023-03-05 00:34:53 +05:30
parent 9d195c2d33
commit 677b846eb2
2 changed files with 119 additions and 61 deletions

View file

@ -0,0 +1,76 @@
/* eslint-disable no-useless-catch */
////////////////////////////////////////////////////////
// IMPORTS
////////////////////////////////////////////////////////
import * as cheerio from 'cheerio';
import getAxiosInstance from '../utils/getAxiosInstance.js';
import AppError from '../utils/AppError.js';
////////////////////////////////////////////////////////
// FUNCTION
////////////////////////////////////////////////////////
/**
 * Requests a page from quora.com (with the encoded resourceStr as the path) and
 * collects the data embedded in the inline scripts of the returned HTML.
 *
 * Every `<script>` inside `<body>` whose contents look like `someProp.push("<data>");`
 * is decoded (the data is a JSON string that itself contains JSON) and its `data`
 * property is sorted into the question, the answers, the related questions or the
 * answer count. Scripts that don't match or can't be decoded are skipped.
 *
 * @param {string} resourceStr a string after the baseURL
 * @param {string} lang language handed over to getAxiosInstance
 * @returns {Promise<{question: object, answers: object[], related: object[], answerCount: number}>}
 * the raw, unformatted blocks picked from the page
 * @throws {AppError} 404 if the upstream response status is 404, 503 if it is 429 or 403,
 * and 500 if no question block could be found in the page. Any other error is rethrown as is.
 */
const answersFetcher = async (resourceStr, lang) => {
  try {
    const axiosInstance = getAxiosInstance(lang);
    const res = await axiosInstance.get(encodeURIComponent(resourceStr));
    const $ = cheerio.load(res.data);
    const rawData = { question: null, answers: [], related: [], answerCount: 0 };

    // there are about 9-10 script tags containing data we need
    // NOTE: the callback must never return `false`, as that would stop cheerio's iteration
    $('body script').each((i, el) => {
      const text = $(el).html();
      // data is contained like: someProp.push("<data>");
      const matches = text?.match(/\.push\((".*")\);/);
      if (!matches) return;

      // brittle logic, but works: the captured part is a JSON string containing JSON
      let matchedPart;
      try {
        matchedPart = JSON.parse(JSON.parse(matches[1]))?.data;
      } catch (err) {
        // some unrelated script that merely looks like a data block; ignore it instead of
        // failing the whole request. a missing question is still reported further below.
        if (err instanceof SyntaxError) return;
        throw err;
      }
      if (!matchedPart) return;

      // only question block has this word
      if (typeof matchedPart.question?.viewerHasAnswered !== 'undefined') {
        rawData.question = matchedPart.question;
        // primary answer block
      } else if (matchedPart.question?.answers?.edges) {
        // edges may be empty, or the first node may carry no answer
        const primaryAnswer = matchedPart.question.answers.edges[0]?.node?.answer;
        if (primaryAnswer) rawData.answers.push(primaryAnswer);
        // other answer blocks
      } else if (
        // eslint-disable-next-line no-underscore-dangle
        matchedPart.node?.__typename === 'QuestionRelevantAnswerItem2'
      ) {
        // consumers read properties off every answer, so never push a missing one
        if (matchedPart.node.answer) rawData.answers.push(matchedPart.node.answer);
        // related questions block contains both answer count and related questions
      } else if (matchedPart.bottomRelatedQuestionsInfo) {
        rawData.related = matchedPart.bottomRelatedQuestionsInfo.relatedQuestions ?? [];
        rawData.answerCount = matchedPart.answerCount ?? 0;
      }
    });

    if (!rawData.question) throw new AppError("couldn't retrieve data", 500);

    return rawData;
  } catch (err) {
    // only errors carrying an HTTP response are translated; everything else is rethrown untouched
    const statusCode = err.response?.status;

    if (statusCode === 404) throw new AppError('Not found', 404);
    else if (statusCode === 429 || statusCode === 403)
      throw new AppError(
        'Quora is rate limiting this instance. Try another or host your own.',
        503
      );
    else throw err;
  }
};
////////////////////////////////////////////////////////
// EXPORTS
////////////////////////////////////////////////////////
export default answersFetcher;

View file

@ -1,87 +1,69 @@
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// IMPORTS // IMPORTS
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// import log from '../utils/log.js';
import AppError from '../utils/AppError.js';
import { quetrefy } from '../utils/urlModifiers.js'; import { quetrefy } from '../utils/urlModifiers.js';
import fetcher from './fetcher.js'; import answersFetcher from './answersFetcher.js';
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// FUNCTION // FUNCTION
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
const KEYWORD = 'question';
const getAnswers = async (slug, lang) => { const getAnswers = async (slug, lang) => {
// getting data and destructuring it in case it exists // getting data and destructuring it in case it exists
const res = await fetcher(slug, { keyword: KEYWORD, lang }); const rawData = await answersFetcher(slug, lang);
const {
data: { [KEYWORD]: rawData },
} = JSON.parse(res);
if (!rawData)
throw new AppError(
"Answers couldn't be fetched. Recheck the URL, or resend the request if you believe the URL is correct.",
404
);
// array containing all the answers with metadata // array containing all the answers with metadata
const ansArr = rawData.pagedListDataConnection.edges const ansArr = rawData.answers.map(answer => ({
.filter(ansObj => ansObj.node.answer !== undefined) text: JSON.parse(answer.content).sections,
.map(ansObj => ({ isViewable: !!answer.viewerHasAccess,
text: JSON.parse(ansObj.node.answer.content).sections, creationTime: answer.creationTime,
isViewable: !!ansObj.node.answer.viewerHasAccess, updatedTime: answer.updatedTime,
creationTime: ansObj.node.answer.creationTime, numComments: answer.numDisplayComments,
updatedTime: ansObj.node.answer.updatedTime, numUpvotes: answer.numUpvotes,
numComments: ansObj.node.answer.numDisplayComments, numViews: answer.numViews,
numUpvotes: ansObj.node.answer.numUpvotes, numShares: answer.numShares,
numViews: ansObj.node.answer.numViews, numAnswerRequests: answer.numRequesters,
numShares: ansObj.node.answer.numSharers, aid: answer.aid,
numAnswerRequests: ansObj.node.answer.numRequesters, isBusinessAnswer: answer.businessAnswer,
aid: ansObj.node.answer.aid,
isBusinessAnswer: ansObj.node.answer.businessAnswer,
author: { author: {
uid: ansObj.node.answer.author.uid, uid: answer.author.uid,
isAnon: ansObj.node.answer.author.isAnon, isAnon: answer.author.isAnon,
image: ansObj.node.answer.author.profileImageUrl, image: answer.author.profileImageUrl,
isVerified: ansObj.node.answer.author.isVerified, isVerified: answer.author.isVerified,
url: quetrefy(ansObj.node.answer.author.profileUrl), url: quetrefy(answer.author.profileUrl),
name: `${ansObj.node.answer.author.names[0].givenName} ${ansObj.node.answer.author.names[0].familyName}`, name: `${answer.author.names[0].givenName} ${answer.author.names[0].familyName}`,
credential: ansObj.node.answer.authorCredential?.translatedString, credential: answer.authorCredential?.translatedString,
// additionalCredentials: ansObj.node.answer?.credibilityFacts.map(), // additionalCredentials: answer.node.answer?.credibilityFacts.map(),
}, },
originalQuestion: { originalQuestion: {
text: JSON.parse(ansObj.node.answer.question.title).sections, text: JSON.parse(answer.question.title).sections,
url: quetrefy(ansObj.node.answer.question.url), url: quetrefy(answer.question.url),
qid: ansObj.node.answer.question.qid, qid: answer.question.qid,
isDeleted: ansObj.node.answer.question.isDeleted, isDeleted: answer.question.isDeleted,
}, },
})); }));
// main data object to be returned // main data object to be returned
const data = { const data = {
question: { question: {
text: JSON.parse(rawData.title).sections, text: JSON.parse(rawData.question.title).sections,
url: quetrefy(rawData.url), url: quetrefy(rawData.question.url),
qid: rawData.qid, qid: rawData.question.qid,
idDeleted: rawData.isDeleted, idDeleted: rawData.question.isDeleted,
isViewable: rawData.isVisibleToViewer, isViewable: rawData.question.isVisibleToViewer,
askerUid: rawData.asker.uid, askerUid: rawData.question.asker.uid,
}, },
numAnswers: rawData.answerCount, numAnswers: rawData.answerCount,
answers: ansArr, answers: ansArr,
topics: rawData.topics.map(topicObj => ({ topics: rawData.question.topics.map(topicObj => ({
tid: topicObj.tid, tid: topicObj.tid,
name: topicObj.name, name: topicObj.name,
url: quetrefy(topicObj.url), url: quetrefy(topicObj.url),
})), })),
relatedQuestions: rawData.bottomRelatedQuestionsInfo.relatedQuestions.map( relatedQuestions: rawData.related.map(questionObj => ({
questionObj => ({
qid: questionObj.qid, qid: questionObj.qid,
url: quetrefy(questionObj.url), url: quetrefy(questionObj.url),
text: JSON.parse(questionObj.title).sections, text: JSON.parse(questionObj.title).sections,
}) })),
),
}; };
return data; return data;