如何在 URL 缩短器中过滤来自机器人的元标签预览请求?

问题描述 投票:0回答:1

我用 Node.js 为我的团队构建了一个内部自定义 URL 缩短器。但是,当生成的短链接被添加到 Instagram、Slack 等平台的消息中时,平台会抓取元标签(标题、图片和描述),用来生成附加在消息上的链接预览。

问题: 此预览请求当前在我的逻辑中被计为一次点击,因为对短链接的请求会在 301 重定向到长 URL 之前增加点击计数

如何稳健地过滤社交平台的元标签预览请求,以便正确统计这个工具的互动(点击)次数?

express request url-shortener
1个回答
0
投票

不是 100% 稳健,但目前这就是我决定解决这个问题的方式:

第 1 步:创建已知机器人用户代理列表

第 2 步:检查传入请求的 User-Agent 是否包含列表中的任一机器人标识。如果包含,则视为机器人,不增加点击次数;如果不包含,则视为真实用户,增加点击次数。

代码如下:

// Lower-case fragments of User-Agent strings that identify known bots,
// crawlers, link-preview fetchers, monitoring tools and HTTP client libraries.
// The detection code that follows this list lower-cases the request's
// User-Agent and flags it as a bot when it CONTAINS any entry (substring
// match, not equality). Because of that, a longer entry such as
// 'googlebot-image' is already covered by the shorter 'googlebot'.
// NOTE(review): short generic entries (e.g. 'base', 'fetch', 'ruby', 'wire')
// could also match non-bot user agents under substring matching — confirm they
// are intended before relying on the resulting click counts.
const KNOWN_BOTS_USER_AGENTS = [
'google favicon',
'google keyword',
'google page speed',
'google pp default',
'google search console',
'google structured data testing tool',
'google web preview',
'google-adwords',
'google-adwords-instant',
'google-read-aloud',
'google-structured-data-testing-tool',
'googlebot',
'googlebot-image',
'googlebot-mobile',
'googlebot-news',
'googlebot-video',
'googlebot/images',
'facebookexternalhit',
'facebookexternalhit/1.0',
'facebookexternalhit/1.1',
'facebookplatform',
'facebot',
'twitterbot',
'bingbot',
'bingpreview',
'slurp',
'duckduckbot',
'baiduspider',
'telegrambot',
'yandexbot',
'sogou',
'exabot',
'linkedinbot',
'embedly',
'quora link preview',
'showyoubot',
'outbrain',
'pinterestbot',
'pinterest/0.',
'developers.google.com/+/web/snippet',
'slackbot',
'vkshare',
'w3c_validator',
'redditbot',
'applebot',
'whatsapp',
'flipboard',
'tumblr',
'bitlybot',
'skypeuripreview',
'nuzzel',
'discordbot',
'qwantify',
'yahoo link preview',
'yahoo! slurp',
'yahoo! slurp china',
'yahoocachesystem',
'yahooysmcm',
'baidu',
'baiduspider-ads',
'baiduspider-cpro',
'baiduspider-favo',
'baiduspider-image',
'baiduspider-news',
'baiduspider-video',
'360spider',
'360spider-image',
'360spider-video',
'aboundex',
'accoona-ai-agent',
'acoon',
'acoonbot',
'addthis',
'addthis.com',
'adidxbot',
'admantx',
'adsbot-google',
'adsbot-google-mobile',
'adsbot-google-mobile-apps',
'ahc',
'ahc/2.0',
'ahrefsbot',
'aihit',
'airmail',
'akula',
'alexa',
'alexabot',
'amagi',
'androiddownloadmanager',
'anemone',
'apercite',
'apis-google',
'applenewsbot',
'aprcovi',
'arachmo',
'archive-com',
'archive.org_bot',
'aria2',
'ask jeeves/teoma',
'asterias',
'b-l-i-t-z-bot',
'backlink-check',
'base',
'bazqux',
'bdfetch',
'begunadvertising',
'bibnum.bnf',
'bigbozz',
'biglotron',
'binlar',
'bitrix link preview',
'blexbot',
'bloglovin',
'blogtrottr',
'boitho',
'boitho.com-dc',
'bolzplatz',
'browsershots',
'bubing',
'bublupbot',
'butterfly',
'buzzsumo',
'bytespider',
'capsulechecker',
'cc metadata scaper',
'ccbot',
'censysinspect',
'cerberian drtrs',
'cg-eye',
'changedetection',
'charlotte',
'checkhost',
'chrome-lighthouse',
'cispa vulnerability notification',
'cjnetworkquality',
'cliqzbot',
'cloudflare-alwaysonline',
'cloudinary',
'cmcm',
'coc coc',
'coccoc',
'coccocbot-image',
'coccocbot-web',
'collections-updater',
'commons-httpclient',
'comodo ssl checker',
'content crawler spider',
'convera',
'cookiereports.com',
'covario-ids',
'crawl',
'crawlforlove',
'crystalsemanticsbot',
'csimarket',
'curb',
'curl',
'custo',
'datacha0s',
'dataparksearch',
'dataprovider.com',
'daum',
'daumoa',
'dazoobot',
'deusu',
'digg',
'domainappender',
'dotbot',
'dotsemantic',
'downforeveryoneorjustme',
'drupact',
'duckduckgo-favicons-bot',
'earthcom',
'earthcom.info',
'easouspider',
'easy-thumb',
'ec2linkfinder',
'ecairn-grabber',
'eccp',
'econtext',
'electricmonk',
'erocheese',
'euripbot',
'europarchive.org',
'evc-batch',
'eventmachine httpclient',
'exploratodo',
'ezooms',
'fairshare',
'faraday v',
'fast enterprise crawler',
'fast-webcrawler',
'favicon',
'favorg',
'feed wrangler',
'feedbin',
'feedburner',
'feedchecker',
'feedfetcher-google',
'feedly',
'feedspot',
'feedwind',
'femtosearchbot',
'fetch',
'fetch api',
'fever',
'findlink',
'findthatfile',
'findxbot',
'flamingo_searchengine',
'flipboardbrowserproxy',
'fluffy',
'g00g1e',
'genieo',
'getprismatic.com',
'gigablast',
'gigablastopensource',
'gingercrawler',
'go-http-client',
'gofetch',
'gomezagent',
'goodzer',
'gotsitemonitor',
'gozilla',
'grapeshotcrawler',
'grouphigh',
'grub.org',
'gslfbot',
'gt::www',
'gtmetrix',
'h00p',
'haosouspider',
'hatena',
'hawkreader',
'heritrix',
'holmes',
'hootsuite',
'hosttracker',
'ht://check',
'htdig',
'http::lite',
'httrack',
'hubpages',
'hubspot connect',
'hubspot marketing grader',
'hyperzbozi.cz feeds',
'i2kconnect',
'ia_archiver',
'iaskspider',
'icc-crawler',
'ichiro',
'iecheck',
'iisbot',
'infegy',
'infohelfer',
'infoseek',
'infowizards reciprocal link system pro',
'instapaper',
'integromedb',
'iodc',
'ioi',
'ips-agent',
'iqdb',
'irokez',
'isitup.org',
'iskanie',
'istellabot',
'izsearch',
'james bot',
'janforman',
'jigsaw',
'jikespider',
'jobboersebot',
'js-kit',
'justview',
'k7mlwcbot',
'keepright openstreetmap checker',
'keycdn',
'kickfire',
'kimonolabs',
'kml-google',
'komodiabot',
'kouio',
'l.webis',
'larbin',
'libwww',
'liebaofast',
'link valet',
'linkcheck',
'linkdetox',
'linkdex',
'linkexaminer',
'linkpadbot',
'linktiger',
'linkvalet',
'lipperhey',
'lipperhey spider',
'livedoor check',
'loadimpactpageanalyzer',
'loadimpactrload',
'longurl api',
'ltx71',
'lwp-trivial',
'lycos',
'magpierss',
'mail.ru',
'mail.ru_bot',
'mandrill',
'marketinggrader',
'mediapartners-google',
'megaindex',
'megaindex.ru',
'metaheadersbot',
'metauri',
'metauri api',
'microsearch',
'microsoft office existence',
'microsoft office protocol discovery',
'microsoft windows network diagnostics',
'microsoft-rds',
'mindjet',
'miniflux',
'mixrankbot',
'mj12bot',
'mnogosearch',
'mogimogi',
'mojeek',
'mojeekbot',
'mojolicious',
'montools',
'moreover',
'morning paper',
'mowser',
'mrcgiguy',
'msfrontpage',
'mshots',
'msnbot',
'msnbot-media',
'msnbot-products',
'msrbot',
'mvaclient',
'nagios',
'najdi.si',
'netcraftsurveyagent',
'netlyzer fastprobe',
'netresearch',
'netresearchserver',
'netshelter contentscan',
'nettrack',
'netvibes',
'newsblur',
'newsgator',
'newsme',
'newspaper',
'ng-search',
'nineconnections',
'nineconnections.com',
'nlnz_iaharvester',
'nmap scripting engine',
'noyona',
'nusearch spider',
'nutch',
'nutchcvs',
'nworm',
'nymesis',
'oegp',
'offline explorer',
'omea reader',
'omgili',
'online domain tools',
'online link validator',
'online website link checker',
'opencalaissemanticproxy',
'openstat',
'openvas',
'optimizer',
'orangebot',
'orbiter',
'orgprobe',
'ow-02',
'ow.ly',
'owlin',
'owncloud news',
'page2rss',
'pagepeeker',
'pagesinventory',
'panopta',
'panscient',
'paperlibot',
'peew',
'phpcrawl',
'pinterest',
'piplbot',
'plukkie',
'pompos',
'postano',
'postpost',
'postrank',
'proximic',
'prtg network monitor',
'psbot',
'pump',
'python-httplib2',
'python-requests',
'python-urllib',
'qirina hurdler',
'qseero',
'radian6',
'rambler',
'rebelmouse',
'rel link checker lite',
'retrevopageanalyzer',
'riddler',
'robosourcer',
'ruby',
'sbider',
'scoutjet',
'scouturlmonitor',
'scrapy',
'scrubby',
'searchsight',
'semanticdiscovery',
'semanticjuice',
'semrushbot',
'seoengworldbot',
'seokicks',
'seopreview',
'seznam screenshot-generator',
'seznambot',
'shopwiki',
'sitebar',
'sitecondor',
'siteexplorer.info',
'siteinspector',
'slackbot-linkexpanding',
'sleuth',
'smartdownload',
'smtbot',
'snappy',
'snoopy',
'socialrankiobot',
'sogou blog',
'sogou head spider',
'sogou inst spider',
'sogou link spider',
'sogou news spider',
'sogou orion spider',
'sogou page spider',
'sogou partner spider',
'sogou pic spider',
'sogou spider',
'sogou spider2',
'sogou video spider',
'sogou web spider',
'sogou-test-spider',
'sonic',
'sortsite',
'sosospider',
'spaziodati',
'spbot',
'speedy',
'sputnikbot',
'sqworm',
'stackrambler',
'suggybot',
'summify',
'sysomos',
't0phackteam',
'tailrank',
'tarantula',
'teoma',
'the architext spider',
'the expert html source viewer',
'theoldreader.com',
'thumbshots',
'thumbsniper',
'tineye',
'tiny tiny rss',
'tomato bot',
'topster',
'touche.com',
'traackr.com',
'truwogps',
'tweetedtimes bot',
'tweetmemebot',
'twikle',
'twingly',
'twingly recon',
'unwindfetchor',
'updated',
'uptimebot',
'urlresolver',
'vagabondo',
'validator.nu',
'viber',
'vivante link checker',
'vortex',
'voyager',
'vyu2',
'wbsrch',
'web-archive-net.com.bot',
'webauto',
'webcollage',
'webcookies',
'webdoc',
'webimagecollector',
'webimages',
'webindex',
'webkit2png',
'webmastercoffee',
'webmeup-crawler',
'webmon',
'webscreenie',
'webster',
'webstripper',
'webthumbnail',
'wesee:ads/pagebot',
'wesee:search',
'whack',
'wire',
'woriobot',
'wotbox',
'wp engine site check',
'wprecon.com survey',
'wume_crawler',
'www-mechanize',
'xaldon_webspider',
'xenu link sleuth',
'xing-contenttabreceiver',
'xmlrpsee',
'xovibot',
'y!j',
'yacybot',
'yandeg',
'yandex',
'yandexadnet',
'yandexantivirus',
'yandexblogs',
'yandexcatalog',
'yandexdirect',
'yandexfavicons',
'yandexfordomain',
'yandeximageresizer',
'yandeximages',
'yandexmedia',
'yandexmetrika',
'yandexmobilebot',
'yandexnews',
'yandexscreenshotbot',
'yandexsearchconsole',
'yandexspravbot',
'yandexturbo',
'yandexverticals',
'yandexvideo',
'yandexwebmaster',
'yasaklibot',
'yeti',
'yioopbot',
'yisouspider',
'yo-yo',
'yoleo consumer',
'yooglifetchagent',
'yoozbot',
'youdaobot',
'zao',
'zemanta aggregator',
'zend_http_client',
'zoominfobot',
'zyborg',
]
// Decide whether the current request comes from a bot, so the caller can skip
// incrementing the click counter for link-preview fetchers and crawlers.
// Reads:  req.headers['user-agent'] and KNOWN_BOTS_USER_AGENTS.
// Writes: `useragent` (raw header value or null) and `isBot` (boolean).

const useragent = req.headers['user-agent'] || null // grab user-agent from request headers

// Normalise once. A missing or non-string header collapses to '' so that every
// later string operation (including the log line) is safe to call.
const normalizedUserAgent = typeof useragent === 'string' ? useragent.toLowerCase().trim() : ''

// Values that mean "no real user agent was supplied": an absent or blank
// header, or a literal placeholder string. These are treated as bots.
const PLACEHOLDER_USER_AGENTS = ['', 'undefined', 'null', 'empty']

let isBot = PLACEHOLDER_USER_AGENTS.includes(normalizedUserAgent)

// Substring match: the request is a bot if its user agent contains any known
// bot fragment. Each entry is normalised too, in case the list gains
// mixed-case or padded entries later.
if (
    !isBot &&
    KNOWN_BOTS_USER_AGENTS.some((botUserAgent) =>
        normalizedUserAgent.includes(botUserAgent.toLowerCase().trim())
    )
) {
    isBot = true
}

if (isBot) {
    // Log the normalised value; the raw header may be null here, so calling
    // string methods on it directly would throw.
    console.log('useragent is a bot --> ', normalizedUserAgent)
}
© www.soinside.com 2019 - 2024. All rights reserved.