Feign client request hangs when scraping multiple websites via a FastAPI web scraping service
Posted: 12 Feb 2025, 11:51
When a FastAPI web scraping service is called from a Spring Boot application through a Feign client, the request hangs at ScrapeResponse response = this.scrapeDate(); without throwing any error. The FastAPI service restarts the scraping run, but the Spring Boot method never gets past the API call. The problem only occurs when scraping multiple websites; scraping a single website works as expected.
The request usually takes up to 1 hour, sometimes 2 hours. This is the Spring Boot method that calls the web scraping service:
Code: Select all
@Scheduled(cron = "0 40 16 * * ?")
public void scrape() {
    log.info("Calling web scraping service...");
    Instant start = Instant.now();
    ScrapeResponse response = this.scrapeDate();
    if (response == null) {
        log.error("Failed to scrape the web");
        return;
    }
    List<Article> scrappedArticles =
        response.data().stream()
            .filter(this::isValidArticle) // Check if the article is valid
            .flatMap(
                article -> {
                    // NOTE: article itself is part of response.data(), so this
                    // title check is always true and each article is emitted twice.
                    boolean existsInResponse1 =
                        response.data().stream().anyMatch(a -> a.title().equals(article.title()));
                    if (existsInResponse1) {
                        return Stream.of(this.buildArticle(article), this.buildArticle(article));
                    } else {
                        return Stream.of(this.buildArticle(article));
                    }
                })
            .toList();
    articleRepository.saveAll(scrappedArticles);
    Instant end = Instant.now();
    long durationInSeconds = end.getEpochSecond() - start.getEpochSecond();
    long minutes = durationInSeconds / 60;
    long seconds = durationInSeconds % 60;
    log.info(
        "Web scraping completed in {} minutes and {} seconds, Scrapped articles: {}",
        minutes,
        seconds,
        scrappedArticles.size());
}
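The method scrapeDate() is not shown above; presumably it just delegates to a Spring Cloud OpenFeign client along the lines of the sketch below. The interface name, the URL property, and the null-returning error handling are assumptions, not part of the original code:

Code: Select all

// Assumed shape of the Feign client behind scrapeDate(); names and the
// URL property are placeholders, since the post does not include this class.
@FeignClient(
    name = "scraping-service",
    url = "${scraping.service.url}",
    configuration = FeignClientConfig.class)
public interface ScrapingServiceClient {

    // Maps to the FastAPI endpoint shown at the end of this post.
    @PostMapping("/scrape/news")
    ScrapeResponse scrapeNews();
}

// Hypothetical wrapper, consistent with the null check in scrape():
private ScrapeResponse scrapeDate() {
    try {
        return scrapingServiceClient.scrapeNews();
    } catch (FeignException e) {
        log.error("Web scraping call failed", e);
        return null;
    }
}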
This is the relevant Feign configuration:
Code: Select all

@Configuration
public class FeignClientConfig {

    private final ObjectMapper objectMapper;

    public FeignClientConfig(ObjectMapper objectMapper) {
        this.objectMapper = objectMapper;
    }

    @Bean
    public Retryer feignRetryer() {
        return new Retryer.Default(100, 1000, 3); // initial interval (ms), max interval (ms), max attempts
    }

    @Bean
    public Request.Options options() {
        return new Request.Options(
            180, TimeUnit.MINUTES, // connectTimeout: 180 minutes (3 hours)
            180, TimeUnit.MINUTES, // readTimeout: 180 minutes (3 hours)
            true                   // followRedirects
        );
    }

    @Bean
    Logger.Level feignLoggerLevel() {
        return Logger.Level.FULL;
    }

    @Bean
    public Encoder feignEncoder() {
        return new JacksonEncoder(objectMapper);
    }

    @Bean
    public Decoder feignDecoder() {
        return new JacksonDecoder(objectMapper);
    }
}
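One behavior of this configuration worth spelling out: Retryer.Default(100, 1000, 3) transparently re-sends a failed request for up to 3 attempts in total, and each attempt can wait out the full 180-minute read timeout, so a single scheduled run may block for many hours without a single error in the Spring Boot log. If retries are not actually wanted for such a long-running call, a minimal sketch of how to switch them off for diagnosis (this deliberately changes the posted behavior):

Code: Select all

@Bean
public Retryer feignRetryer() {
    // NEVER_RETRY makes a timeout surface immediately as a RetryableException
    // instead of silently re-issuing the multi-hour scrape request.
    return Retryer.NEVER_RETRY;
}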
And finally, this is my FastAPI code:
Code: Select all

@app.post("/scrape/news")
async def scrape_news_articles():
    thematics_file_path = 'files/thematics.json'
    thematics_data = load_items(thematics_file_path)
    thematics = [speciality.name['fr'] for speciality in thematics_data]
    try:
        # NOTE: scrape_news_articles_function is fully synchronous (Selenium),
        # so this call blocks the event loop for the whole scraping run.
        data = scrape_news_articles_function(thematics)
    except requests.exceptions.ReadTimeout:
        # Retry with base64 encoding
        encoded_thematics = base64.b64encode(str(thematics).encode('utf-8')).decode('utf-8')
        data = scrape_news_articles_function(encoded_thematics, base64_encoded=True)
    return {"data": data}
Code: Select all

def scrape_news_articles_function(thematics, base64_encoded=False):
    if base64_encoded:
        thematics = base64.b64decode(thematics).decode('utf-8')
    driver = configure_webdriver()
    response = []
    response.extend(scrape_data_business_news(thematics, driver))
    response.extend(scrape_data_leconomiste(thematics, driver))
    response.extend(scrape_data_kapitalis(thematics, driver))
    response.extend(scrape_data_lapresse(thematics, driver))  # Yemchi
    response.extend(scrape_data_le_temps(thematics, driver))
    response.extend(scrape_data_sante_tunisie(thematics, driver))
    response.extend(scrape_data_tuniscope(thematics, driver))
    response.extend(scrape_data_tunisie_numerique(thematics, driver))
    response.extend(scrape_data_webdo(thematics, driver))  # Yemchi
    response.extend(scrape_data_unicef(thematics, driver))
    driver.quit()
    print("Scraping news articles done.")
    return response