fix: fix broken answers feed

This commit fixes answers not being shown: the underlying HTML of the page changed, which broke
the old scraping logic.

BREAKING CHANGE: older versions will not work with the answer route

fix https://github.com/zyachel/quetre/issues/96
This commit is contained in:
zyachel 2023-03-05 00:34:53 +05:30
parent 9d195c2d33
commit 677b846eb2
2 changed files with 119 additions and 61 deletions

View file

@ -0,0 +1,76 @@
/* eslint-disable no-useless-catch */
////////////////////////////////////////////////////////
// IMPORTS
////////////////////////////////////////////////////////
import * as cheerio from 'cheerio';
import getAxiosInstance from '../utils/getAxiosInstance.js';
import AppError from '../utils/AppError.js';
////////////////////////////////////////////////////////
// FUNCTION
////////////////////////////////////////////////////////
/**
 * Matches the inline-script statement `someProp.push("<data>");` and captures
 * the quoted string literal (including its surrounding double quotes).
 */
const PUSHED_PAYLOAD_REGEX = /\.push\((".*")\);/;

/**
 * Extracts the `data` object embedded in the text of an inline script tag.
 * The payload is a JSON document that was itself serialised as a JSON string
 * literal, hence the double parse.
 * @param {string | null} scriptText contents of a script tag
 * @returns {object | null} the payload's `data` property, or null when the script
 * doesn't contain a payload in the expected shape
 */
const extractPushedData = scriptText => {
  const matches = scriptText?.match(PUSHED_PAYLOAD_REGEX);
  if (!matches) return null;

  try {
    const data = JSON.parse(JSON.parse(matches[1]))?.data;
    return data !== null && typeof data === 'object' ? data : null;
  } catch {
    // a script can call .push() with something other than the JSON we're after.
    // skipping it is deliberate: one unrelated script shouldn't fail the whole request,
    // and a page with no usable data is still reported by the caller of this helper.
    return null;
  }
};

/**
 * requests the page at resourceStr (URI-encoded) through the axios instance created for lang,
 * and collects the question, its answers, the related questions and the answer count
 * from the data embedded in the script tags of the page body.
 * @param {string} resourceStr a string after the baseURL
 * @param {string} lang value forwarded to getAxiosInstance
 * @returns {Promise<{question: object, answers: object[], related: object[], answerCount: number}>} the raw, unformatted data blocks
 * @throws {AppError} 500 when no question block is found, 404 when the response status is 404,
 * 503 when the response status is 429 or 403. any other error is rethrown untouched.
 */
const answersFetcher = async (resourceStr, lang) => {
  try {
    const axiosInstance = getAxiosInstance(lang);
    const res = await axiosInstance.get(encodeURIComponent(resourceStr));
    const $ = cheerio.load(res.data);
    const rawData = { question: null, answers: [], related: [], answerCount: 0 };

    // there are about 9-10 script tags containing data we need
    $('body script').each((_, el) => {
      const data = extractPushedData($(el).html());
      if (!data) return;

      const { question, node } = data;

      // only question block has this property
      if (typeof question?.viewerHasAnswered !== 'undefined') {
        rawData.question = question;
        // primary answer block
      } else if (question?.answers?.edges) {
        // edges can be empty, and a node isn't guaranteed to carry an answer
        const primaryAnswer = question.answers.edges[0]?.node?.answer;
        if (primaryAnswer) rawData.answers.push(primaryAnswer);
        // other answer blocks
      } else if (
        // eslint-disable-next-line no-underscore-dangle
        node?.__typename === 'QuestionRelevantAnswerItem2'
      ) {
        if (node.answer) rawData.answers.push(node.answer);
        // related questions block contains both answer count and related questions
      } else if (data.bottomRelatedQuestionsInfo) {
        // keep the defaults when a field is missing so that consumers always get an array/number
        rawData.related = data.bottomRelatedQuestionsInfo.relatedQuestions ?? rawData.related;
        rawData.answerCount = data.answerCount ?? rawData.answerCount;
      }
    });

    if (!rawData.question) throw new AppError("couldn't retrieve data", 500);

    return rawData;
  } catch (err) {
    // only errors coming from the HTTP request carry a response
    const statusCode = err?.response?.status;

    if (statusCode === 404) throw new AppError('Not found', 404);
    if (statusCode === 429 || statusCode === 403) {
      throw new AppError(
        'Quora is rate limiting this instance. Try another or host your own.',
        503
      );
    }

    throw err;
  }
};
////////////////////////////////////////////////////////
// EXPORTS
////////////////////////////////////////////////////////
export default answersFetcher;

View file

@ -1,87 +1,69 @@
////////////////////////////////////////////////////////
// IMPORTS
////////////////////////////////////////////////////////
// import log from '../utils/log.js';
import AppError from '../utils/AppError.js';
import { quetrefy } from '../utils/urlModifiers.js';
import fetcher from './fetcher.js';
import answersFetcher from './answersFetcher.js';
////////////////////////////////////////////////////////
// FUNCTION
////////////////////////////////////////////////////////
const KEYWORD = 'question';
const getAnswers = async (slug, lang) => {
// getting data and destructuring it in case it exists
const res = await fetcher(slug, { keyword: KEYWORD, lang });
const {
data: { [KEYWORD]: rawData },
} = JSON.parse(res);
if (!rawData)
throw new AppError(
"Answers couldn't be fetched. Recheck the URL, or resend the request if you believe the URL is correct.",
404
);
const rawData = await answersFetcher(slug, lang);
// array containing all the answers with metadata
const ansArr = rawData.pagedListDataConnection.edges
.filter(ansObj => ansObj.node.answer !== undefined)
.map(ansObj => ({
text: JSON.parse(ansObj.node.answer.content).sections,
isViewable: !!ansObj.node.answer.viewerHasAccess,
creationTime: ansObj.node.answer.creationTime,
updatedTime: ansObj.node.answer.updatedTime,
numComments: ansObj.node.answer.numDisplayComments,
numUpvotes: ansObj.node.answer.numUpvotes,
numViews: ansObj.node.answer.numViews,
numShares: ansObj.node.answer.numSharers,
numAnswerRequests: ansObj.node.answer.numRequesters,
aid: ansObj.node.answer.aid,
isBusinessAnswer: ansObj.node.answer.businessAnswer,
author: {
uid: ansObj.node.answer.author.uid,
isAnon: ansObj.node.answer.author.isAnon,
image: ansObj.node.answer.author.profileImageUrl,
isVerified: ansObj.node.answer.author.isVerified,
url: quetrefy(ansObj.node.answer.author.profileUrl),
name: `${ansObj.node.answer.author.names[0].givenName} ${ansObj.node.answer.author.names[0].familyName}`,
credential: ansObj.node.answer.authorCredential?.translatedString,
// additionalCredentials: ansObj.node.answer?.credibilityFacts.map(),
},
originalQuestion: {
text: JSON.parse(ansObj.node.answer.question.title).sections,
url: quetrefy(ansObj.node.answer.question.url),
qid: ansObj.node.answer.question.qid,
isDeleted: ansObj.node.answer.question.isDeleted,
},
}));
const ansArr = rawData.answers.map(answer => ({
text: JSON.parse(answer.content).sections,
isViewable: !!answer.viewerHasAccess,
creationTime: answer.creationTime,
updatedTime: answer.updatedTime,
numComments: answer.numDisplayComments,
numUpvotes: answer.numUpvotes,
numViews: answer.numViews,
numShares: answer.numShares,
numAnswerRequests: answer.numRequesters,
aid: answer.aid,
isBusinessAnswer: answer.businessAnswer,
author: {
uid: answer.author.uid,
isAnon: answer.author.isAnon,
image: answer.author.profileImageUrl,
isVerified: answer.author.isVerified,
url: quetrefy(answer.author.profileUrl),
name: `${answer.author.names[0].givenName} ${answer.author.names[0].familyName}`,
credential: answer.authorCredential?.translatedString,
// additionalCredentials: answer.node.answer?.credibilityFacts.map(),
},
originalQuestion: {
text: JSON.parse(answer.question.title).sections,
url: quetrefy(answer.question.url),
qid: answer.question.qid,
isDeleted: answer.question.isDeleted,
},
}));
// main data object to be returned
const data = {
question: {
text: JSON.parse(rawData.title).sections,
url: quetrefy(rawData.url),
qid: rawData.qid,
idDeleted: rawData.isDeleted,
isViewable: rawData.isVisibleToViewer,
askerUid: rawData.asker.uid,
text: JSON.parse(rawData.question.title).sections,
url: quetrefy(rawData.question.url),
qid: rawData.question.qid,
idDeleted: rawData.question.isDeleted,
isViewable: rawData.question.isVisibleToViewer,
askerUid: rawData.question.asker.uid,
},
numAnswers: rawData.answerCount,
answers: ansArr,
topics: rawData.topics.map(topicObj => ({
topics: rawData.question.topics.map(topicObj => ({
tid: topicObj.tid,
name: topicObj.name,
url: quetrefy(topicObj.url),
})),
relatedQuestions: rawData.bottomRelatedQuestionsInfo.relatedQuestions.map(
questionObj => ({
qid: questionObj.qid,
url: quetrefy(questionObj.url),
text: JSON.parse(questionObj.title).sections,
})
),
relatedQuestions: rawData.related.map(questionObj => ({
qid: questionObj.qid,
url: quetrefy(questionObj.url),
text: JSON.parse(questionObj.title).sections,
})),
};
return data;