0 votes
in Programming by (580 points)

The first URL request builds a list of all the research-paper names; I have also added the request headers.

import csv
import re
import time
import urllib.parse
from random import randint

import requests
from bs4 import BeautifulSoup
# Fetch the NeurIPS 2014 proceedings index page and collect every anchor tag.
# The paper titles live in <a> elements; `anchors` is sliced/filtered later.
url = "https://papers.nips.cc/paper/2014"
user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0"
headers = {
    'User-Agent': user_agent,
    'Accept-Language': "en-US,en;q=0.5",
}
# timeout= prevents the script hanging forever on a stalled connection;
# raise_for_status() surfaces HTTP errors (403/429/5xx) instead of silently
# parsing an error page as if it were the proceedings index.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
html = r.content
soup = BeautifulSoup(html, 'html.parser')
fin = []  # accumulator for scraped results (not yet written to by the loop below)
url2 = "https://scholar.google.com/scholar?q="  # Google Scholar search prefix
anchors = soup.find_all('a')
count = 0  # running index printed by the citation loop

Then the loop iterates over those links and fetches the number of citations for each paper. I call the sleep method in every iteration, but after some time I still get blocked.

# Query Google Scholar once per paper-title link. The slice skips the first
# 80 anchors — presumably site navigation/header links; confirm against the page.
for an in anchors[80:]:
    title = an.text
    # BUG FIX: the original did title.replace(" ", "+"), which only encodes
    # spaces. quote_plus percent-encodes every reserved character (&, ?, #,
    # +, :, ...), so titles containing them no longer corrupt the query string.
    url3 = url2 + urllib.parse.quote_plus(title)
    print(url3)
    x = requests.get(url3, headers=headers, timeout=30)
    if x.status_code == 429:
        # Scholar is rate-limiting this client — the likely cause of the
        # "getting blocked" symptom. Back off much longer before continuing.
        print("Rate limited (HTTP 429); backing off before the next request")
        time.sleep(randint(120, 240))
    ht = x.content
    soup2 = BeautifulSoup(ht, 'html.parser')
    # The "Cited by N" link on a Scholar result points at /scholar?cites=...
    b = soup2.find_all('a', attrs={'href': re.compile(r'/scholar\?cites=.*')})
    print(count)
    print(b)
    # Randomized delay between requests to look less like an automated client.
    time.sleep(randint(10, 20))
    count += 1
        

Please log in or register to answer this question.

...