Extrahieren des Benutzernamens des Besitzers aus der verschachtelten Seite auf HuggingFace

Anonymous · Post by **Anonymous** » 29 Dec 2024, 12:08

Ich durchsuche das HuggingFace-Forschungsforum (https://discuss.huggingface.co/c/research/7/l/latest) mit Selenium. Ich konnte die folgenden Attribute erfolgreich von der Hauptseite des Forums extrahieren:

Aktivitätsdatum
Anzahl der Aufrufe
Anzahl der Antworten
Titel
URL

Ich stoße jedoch auf ein Problem, wenn ich versuche, den Benutzernamen des Eigentümers aus den einzelnen Themenseiten zu extrahieren. Der Benutzername des Eigentümers befindet sich auf einer verschachtelten Seite, die über die URL im Themenlink der Hauptseite zugänglich ist.
Auf der Hauptseite habe ich beispielsweise das folgende HTML-Snippet für a Thema:

Code: Select all

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import time

# Set up Chrome options to use headless mode (for Colab)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--ignore-certificate-errors")
chrome_options.add_argument("--incognito")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64;  x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

# Set the path to chromedriver explicitly (installed by apt)
chrome_path = "/usr/bin/chromedriver"

# Initialize the WebDriver with the updated path
driver = webdriver.Chrome(options=chrome_options)

# Open the HuggingFace page
url = "https://discuss.huggingface.co/c/research/7/l/latest"  # URL for HuggingFace Issues
driver.get(url)

# Wait for the page to load
time.sleep(6)

def scrape_huggingface_issues():
titles_and_links = []
seen_titles_and_links = set()
owner = []
replies = []
views = []
activity = []

while True:
try:
# Find all issue rows (elements in the table)
elements = driver.find_elements(By.CSS_SELECTOR, 'tr.topic-list-item')

# Extract and store the titles, links, and other data
for elem in elements:
topic_id = elem.get_attribute("data-topic-id")
if topic_id in seen_titles_and_links:
continue

seen_titles_and_links.add(topic_id)

# Extract title and link
selected_title = elem.find_element(By.CSS_SELECTOR, 'a.title.raw-link.raw-topic-link')
title = selected_title.text.strip()
relative_link = selected_title.get_attribute('href')  # Get the relative URL from the href attribute
full_link = relative_link  # Construct the absolute URL (if needed)

# Extract replies count
try:
replies_elem = elem.find_element(By.CSS_SELECTOR, 'button.btn-link.posts-map.badge-posts')
replies_count = replies_elem.find_element(By.CSS_SELECTOR, 'span.number').text.strip()
except:
replies_count = "0"

# Extract views count
try:
views_elem = elem.find_element(By.CSS_SELECTOR, 'td.num.views.topic-list-data')
views_count = views_elem.find_element(By.CSS_SELECTOR, 'span.number').text.strip()
except:
views_count = "0"

# Extract activity (last activity)
try:
activity_elem = elem.find_element(By.CSS_SELECTOR, 'td.num.topic-list-data.age.activity')
activity_text = activity_elem.get_attribute('title').strip()
except:
activity_text = "N/A"

# Use the helper function to get the owner info from the topic page
owner_text = scrape_issue_details(relative_link)

# Store the extracted data in the lists
titles_and_links.append((title, full_link, owner_text, replies_count, views_count, activity_text))
seen_titles_and_links.add((title, full_link))  # Add to the seen set to avoid duplicates

# Scroll down to load more content (if the forum uses infinite scroll)
driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
time.sleep(3)  # Adjust based on loading speed

# Check if the "Next" button is available and click it
try:
next_button = driver.find_element(By.CSS_SELECTOR, 'a.next.page-numbers')
next_button.click()
time.sleep(3)  # Wait for the next page to load
except:
# If there's no "Next" button, exit the loop
print("No more pages to scrape.")
break

except Exception as e:
print(f"Error occurred:  {e}")
continue

return titles_and_links

def scrape_issue_details(url):
"""
Navigate to the topic page and scrape additional details like the owner's username.
"""
# Go to the topic page
driver.get(url)
time.sleep(3)  # Wait for the page to load

# Extract the owner's username
try:
owner_elem = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span.first.username.new-user')))
owner_username_fetch = owner_elem.find_element(By.CSS_SELECTOR, 'a').text.strip()
owner_username = owner_elem.text.strip()  # Extract the username from the link
except Exception as e:
owner_username = "N/A"  # Default value if no owner found

return owner_username

# Scrape the HuggingFace issues across all pages
issues = scrape_huggingface_issues()

# Print the titles, links, and additional data (owner, replies, views, activity)
print("Scraped Titles, Links, Owner, Replies, Views, Activity:")
for i, (title, link, owner_text, replies_count, views_count, activity_text) in enumerate(issues, 1):
print(f"{i}: {title} - {link} - Owner: {owner_text} - Replies: {replies_count} - Views: {views_count} - Activity: {activity_text}")

# Close the browser
driver.quit()

Problem:
Ich kann den Benutzernamen des Eigentümers nicht von der einzelnen Themenseite abrufen. Nachdem ich der URL gefolgt bin, kann ich den Benutzernamen des Eigentümers nicht finden und extrahieren, obwohl ich seine Position im HTML kenne.

Code: Select all

[url=/t/model-that-can-generate-both-text-and-image-as-output/132209]Model that can generate both text and image as output[/url]

Der Benutzername des Eigentümers befindet sich auf der einzelnen Seite des Themas im folgenden HTML-Snippet:

Code: Select all

[url=/u/InsertOPUsername]InsertOPUsername[/url]

Was ich versucht habe:

Ich habe Driver.get(url) verwendet. um zu den einzelnen Themenseiten zu navigieren.
Ich habe versucht, den Benutzernamen mithilfe von WebDriverWait und dem richtigen CSS-Selektor (span.first.username.new-user a) zu finden.
Ich entferne erfolgreich andere Details wie Aktivität, Aufrufe und Antworten auf der Hauptseite, aber der Benutzername des Eigentümers kann nicht von der Themenseite abgerufen werden.

Extrahieren des Benutzernamens des Besitzers aus der verschachtelten Seite auf HuggingFace

Extrahieren des Benutzernamens des Besitzers aus der verschachtelten Seite auf HuggingFace ⇐ Python

Quick Reply