Stellen Sie die Kundenanfrage vor, wenn Sie mehrere Websites im Fastapi Web Scraping -Service abkratzenPython

Python-Programme
Anonymous
 Stellen Sie die Kundenanfrage vor, wenn Sie mehrere Websites im Fastapi Web Scraping -Service abkratzen

Post by Anonymous »

Bei einem Fastapi -Web -Scraping -Dienst von einer Spring -Boot -Anwendung mit einem vorgetäuschten Client hängt die Anforderung bei Scrapersponse -Antwort = this.Scrapedate (); ohne Fehler zu werfen. Der Fastapi -Dienst startet den Schablonenvorgang neu, aber die Feder -Boot -Funktion geht nicht über den API -Aufruf hinaus. Das Problem tritt nur beim Schaben mehrerer Websites auf, während das Abkratzen einer einzelnen Website wie erwartet funktioniert.
Normalerweise dauert die Anforderung bis zu 1 Stunde, manchmal 2 Stunden. Startfunktion, die den Webcraping-Dienst aufruft: < /p>

Code: Select all

@Scheduled(cron = "0 40 16 * * ?")minutes)
public void scrape() {
log.info("Calling web scraping service...");
Instant start = Instant.now();
ScrapeResponse response = this.scrapeDate();

if (response == null) {
log.error("Failed to scrape the web");
return;
}
List scrappedArticles =
response.data().stream()
.filter(this::isValidArticle) // Check if the article is valid
.flatMap(
article -> {
boolean existsInResponse1 =
response.data().stream().anyMatch(a -> a.title().equals(article.title()));
if (existsInResponse1) {
return Stream.of(this.buildArticle(article), this.buildArticle(article));
} else {
return Stream.of(this.buildArticle(article));
}
})
.toList();
articleRepository.saveAll(scrappedArticles);
Instant end = Instant.now();
long durationInSeconds = end.getEpochSecond() - start.getEpochSecond();
long minutes = durationInSeconds / 60;
long seconds = durationInSeconds % 60;
log.info(
"Web scraping completed in {} minutes and {} seconds, Scrapped articles: {}",
minutes,
seconds,
scrappedArticles.size());
}
< /code>
Dies ist die vorliegende Konfigurationsdatei: < /p>
@Configuration
public class FeignClientConfig {

private final ObjectMapper objectMapper;

public FeignClientConfig(ObjectMapper objectMapper) {
this.objectMapper = objectMapper;
}

@Bean
public Retryer feignRetryer() {
return new Retryer.Default(100, 1000, 3); // Initial interval, max interval, max attempts
}

@Bean
public Request.Options options() {
return new Request.Options(
180, TimeUnit.MINUTES, // connectTimeout in minutes (3 hours)
180, TimeUnit.MINUTES, // readTimeout in minutes (3 hours)
true
);
}

@Bean
Logger.Level feignLoggerLevel() {
return Logger.Level.FULL;
}

@Bean
public Encoder feignEncoder() {
return new JacksonEncoder(objectMapper);
}

@Bean
public Decoder feignDecoder() {
return new JacksonDecoder(objectMapper);
}
}
< /code>
Und schließlich ist dies mein schneller API -Code: < /p>
@app.post("/scrape/news")
async def scrape_news_articles():
thematics_file_path = 'files/thematics.json'

thematics_data = load_items(thematics_file_path)
thematics = [speciality.name['fr'] for speciality in thematics_data]

try:
data = scrape_news_articles_function(thematics)
except requests.exceptions.ReadTimeout:
# Retry with base64 encoding
encoded_thematics = base64.b64encode(str(thematics).encode('utf-8')).decode('utf-8')
data = scrape_news_articles_function(encoded_thematics, base64_encoded=True)
return {"data": data}
< /code>
def scrape_news_articles_function(thematics, base64_encoded=False):
if base64_encoded:
thematics = base64.b64decode(thematics).decode('utf-8')

driver = configure_webdriver()

response = []
response.extend(scrape_data_business_news(thematics, driver))
response.extend(scrape_data_leconomiste(thematics, driver))
response.extend(scrape_data_kapitalis(thematics, driver))
response.extend(scrape_data_lapresse(thematics, driver)) #Yemchi
response.extend(scrape_data_le_temps(thematics, driver))
response.extend(scrape_data_sante_tunisie(thematics, driver))
response.extend(scrape_data_tuniscope(thematics, driver))
response.extend(scrape_data_tunisie_numerique(thematics, driver))
response.extend(scrape_data_webdo(thematics, driver)) #Yemchi
response.extend(scrape_data_unicef(thematics, driver))

driver.quit()
print("Scraping news articles done.")

return response

Quick Reply

Change Text Case: 
   
  • Similar Topics
    Replies
    Views
    Last post