Skip to content

Commit

Permalink
fix(SoFIFA): scrape player ratings (#804)
Browse files (browse the repository at this point in the history)
---------

Co-authored-by: Francisco Guerreiro <[email protected]>
  • Loading branch information
franciscofguerreiro and Francisco Guerreiro authored Feb 9, 2025
1 parent 192bddd commit 5d68618
Showing 1 changed file with 17 additions and 11 deletions.
28 changes: 17 additions & 11 deletions soccerdata/sofifa.py
Original file line number | Diff line number | Diff line change
@@ -475,16 +475,22 @@ def read_player_ratings(
"player": before_br if before_br else after_br,
**version.to_dict(),
}
for s in score_labels:
nodes = tree.xpath(
f"(//li[not(self::script)] | //div | //p)[.//text()[contains(.,'{s}')]]//em"
)
# for multiple matches, only accept first match
if len(nodes) >= 1:
scores[s] = nodes[0].text.strip()
# if there's no match, put NA
else:
scores[s] = None
ratings.append(scores)

# Try each XPath until one returns a result
for s in score_labels:
value = None
xpaths = [
f"//p[.//text()[contains(.,'{s}')]]/span/em",
f"//div[contains(.,'{s}')]/em",
f"//li[not(self::script)][.//text()[contains(.,'{s}')]]/em",
]
for xpath in xpaths:
nodes = tree.xpath(xpath)
if nodes: # If at least one match is found
value = nodes[0].text.strip() # Take only the first match
break # Stop checking other XPaths once we find a valid value

scores[s] = value if value is not None else None # Assign only once
ratings.append(scores)
# return data frame
return pd.DataFrame(ratings).pipe(standardize_colnames).set_index(["player"]).sort_index()

0 comments on commit 5d68618

Please sign in to comment.