---
# Lyrics provider definition for Genius (genius.com).
name: "Genius"

# Values substituted into the `{artist}` / `{title}` placeholders of
# `config.url`. Filter order matters: later entries rely on the output of
# earlier ones.
variables:
  artist:
    type: artist
    filters:
      - strip_diacritics
      - lowercase
      # Artist exception: "!!!" (Chk Chk Chk).
      - [replace, "!!!", "chk-chik-chick"]
      # Strip a "feat"/"featuring"/"ft"/"f." credit plus everything after it.
      # The marker has to be followed by a non-word character, so ordinary
      # words that merely start with "feat" (e.g. "feather") are kept.
      - [regex, '(?<=\W)(feat(uring)?\W+|ft\W+|f\.\s).+', ""]
      # Replace " &" with " and".
      # Per the original author, this is currently superfluous for the artist
      # because the plugin already replaces "&" with "and" beforehand, but it
      # is still needed for the title, which reuses this chain.
      - [regex, '\s&(?=\s)', " and"]
      # Strip every "." and ",", plus non-word characters at the very start
      # or end of the string.
      - [regex, '\.+|,+|(\W+(?=$))|(^\W+)', ""]
      # Remove apostrophes. Double-quoted on purpose: inside a single-quoted
      # YAML scalar the apostrophe would have to be doubled ('').
      - [regex, "'", ""]
      # Replace a medial run of non-alphanumeric, non-space characters with a
      # single "-", e.g. f**k > f-k.
      - [regex, '(?<=[a-z0-9%])[^\sa-z0-9%]+(?=[a-z0-9%]+)', "-"]
      # Collapse any remaining runs of successive non-alphanumeric characters
      # into one space before strip_nonascii runs.
      # Written with capturing groups only because the original author could
      # not get a non-capturing "(?:...)" group working in this file; the
      # simpler form they had in mind was:
      #   [regex, '\W+(?:\W)(?<!$)', " "]
      - [regex, '((?<=\s)([^a-z0-9\s-])+(\s|\W)+)|((?<=\w)([^a-z0-9-])+(\s|\W)+)', " "]
      # "-" is quoted so it can never be read as a sequence indicator.
      - [strip_nonascii, "-"]
  title:
    type: title
    # Reuse the filter chain declared for `artist`.
    filters: artist

config:
  url: "http://genius.com/{artist}-{title}-lyrics"
  # Capture the body of the lyrics <div> as the named group `lyrics`.
  # NOTE(review): the trailing `s` is presumably the single-line
  # (dot-matches-newline) regex option - confirm against the plugin.
  pattern: ['<div\s+class="lyrics"[^>]*?>(?<lyrics>.*?)</div>', s]

# Clean-up steps; presumably applied, in order, to the captured lyrics.
post-filters:
  - strip_html
  - clean_spaces
  - utf8_encode
---
# Lyrics provider definition for Musixmatch (www.musixmatch.com).
# The leading "---" starts a new YAML document so that this provider's
# top-level keys do not collide with any definition that precedes it.
# NOTE(review): confirm the consuming plugin reads more than one document per
# file; otherwise this definition belongs in a file of its own.
name: "Musixmatch"

# Values substituted into the `{artist}` / `{title}` placeholders of
# `config.url`. Filter order matters: later entries rely on the output of
# earlier ones.
variables:
  artist:
    type: artist
    filters:
      - strip_diacritics
      - lowercase
      # Artist exception: "!!!" (Chk Chk Chk).
      - [replace, "!!!", "artist-46206"]
      # Artist exception "+/-", step 1: swap in the placeholder "p%m" so the
      # name survives the filters below (step 2 is the last filter).
      - [replace, "+/-", "p%m"]
      # Strip a "feat"/"featuring"/"ft"/"f." credit plus everything after it.
      # The marker has to be followed by a non-word character, so ordinary
      # words that merely start with "feat" (e.g. "feather") are kept.
      - [regex, '(?<=\W)(feat(uring)?\W+|ft\W+|f\.\s).+', ""]
      # URL-encode apostrophes as "%27"; per the original author Musixmatch
      # accepts that form. Double-quoted on purpose: inside a single-quoted
      # YAML scalar the apostrophe would have to be doubled ('').
      - [regex, "'", "%27"]
      # Replace " & " with " and " ONLY when it is followed by "the".
      # Per the original author, the plugin already replaces every "&" with
      # "and" before these filters run, so this entry is currently superfluous
      # and Musixmatch lookups fail for artists containing an "&" that is not
      # followed by "the".
      - [regex, '\s&\s(?=the)', " and "]
      # Replace a medial run of non-alphanumeric, non-space characters with a
      # single "-" ("%" is left alone so the "%27" encoding survives),
      # e.g. M.I.A. > M-I-A.
      - [regex, '(?<=[a-z0-9%])[^\sa-z0-9%]+(?=[a-z0-9%]+)', "-"]
      # Strip non-word characters at the end of the string.
      - [regex, '\W+(?=$)', ""]
      # Collapse any remaining runs of successive non-alphanumeric characters
      # into one space before strip_nonascii runs.
      # Written with capturing groups only because the original author could
      # not get a non-capturing "(?:...)" group working in this file; the
      # simpler form they had in mind was:
      #   [regex, '\W+(?:\W)(?<!$)', " "]
      - [regex, '((?<=\s)([^a-z0-9\s-])+(\s|\W)+)|((?<=\w)([^a-z0-9-])+(\s|\W)+)', " "]
      # "-" is quoted so it can never be read as a sequence indicator.
      - [strip_nonascii, "-"]
      # Restore the apostrophe encoding, which leaves strip_nonascii as "-27".
      # NOTE(review): this rewrites every "-27" in the string, so a name that
      # contains a standalone "27" would presumably be changed too - confirm.
      - [replace, "-27", "%27"]
      # Artist exception "+/-", step 2: the placeholder comes out of
      # strip_nonascii as "p-m"; at the start of the string turn it into "-".
      - [regex, '(?<=^)p-m', "-"]
  title:
    type: title
    # The artist chain is not reused here because "&" is handled differently:
    # per the original author, Musixmatch ALWAYS strips "&" from titles, and
    # strips it from artist names too unless it is "& the", which becomes
    # "and" (hence the extra "&" rule in the artist chain).
    filters:
      - strip_diacritics
      - lowercase
      # Strip a "feat"/"featuring"/"ft"/"f." credit plus everything after it.
      # The marker has to be followed by a non-word character, so ordinary
      # words that merely start with "feat" (e.g. "feather") are kept.
      - [regex, '(?<=\W)(feat(uring)?\W+|ft\W+|f\.\s).+', ""]
      # URL-encode apostrophes as "%27" (double-quoted so the apostrophe needs
      # no YAML escaping).
      - [regex, "'", "%27"]
      # Replace a medial run of non-alphanumeric, non-space characters with a
      # single "-" ("%" is left alone), e.g. f**k > f-k.
      - [regex, '(?<=[a-z0-9%])[^\sa-z0-9%]+(?=[a-z0-9%]+)', "-"]
      # Strip non-word characters at the end of the string.
      - [regex, '\W+(?=$)', ""]
      # Collapse any remaining runs of successive non-alphanumeric characters
      # into one space before strip_nonascii runs.
      - [regex, '((?<=\s)([^a-z0-9\s-])+(\s|\W)+)|((?<=\w)([^a-z0-9-])+(\s|\W)+)', " "]
      # "-" is quoted so it can never be read as a sequence indicator.
      - [strip_nonascii, "-"]
      # Restore the apostrophe encoding, which leaves strip_nonascii as "-27".
      # NOTE(review): this rewrites every "-27" in the string, so a title that
      # contains a standalone "27" would presumably be changed too - confirm.
      - [replace, "-27", "%27"]

config:
  url: "http://www.musixmatch.com/lyrics/{artist}/{title}"
  # Capture the body of the lyrics <span> as the named group `lyrics`.
  # NOTE(review): the trailing `s` is presumably the single-line
  # (dot-matches-newline) regex option - confirm against the plugin.
  pattern: ['<span\s+id="lyrics-html"[^>]*?>(?<lyrics>.*?)</span>', s]

# Clean-up step; presumably applied to the captured lyrics.
post-filters:
  - utf8_encode