feat(YouTube - Keyword filter): Add syntax to match whole keywords and not substrings (#681)

Co-authored-by: oSumAtrIX <johan.melkonyan1@web.de>
This commit is contained in:
LisoUseInAIKyrios 2024-08-30 17:38:44 -04:00 committed by GitHub
parent db81332078
commit 5314dd90d1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 253 additions and 32 deletions

View File

@ -2,6 +2,7 @@ package app.revanced.integrations.youtube.patches.components;
import static app.revanced.integrations.shared.StringRef.str; import static app.revanced.integrations.shared.StringRef.str;
import static app.revanced.integrations.youtube.shared.NavigationBar.NavigationButton; import static app.revanced.integrations.youtube.shared.NavigationBar.NavigationButton;
import static java.lang.Character.UnicodeBlock.*;
import android.os.Build; import android.os.Build;
@ -10,9 +11,8 @@ import androidx.annotation.Nullable;
import androidx.annotation.RequiresApi; import androidx.annotation.RequiresApi;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Arrays; import java.util.LinkedHashMap;
import java.util.LinkedHashSet; import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.atomic.AtomicReference;
import app.revanced.integrations.shared.Logger; import app.revanced.integrations.shared.Logger;
@ -26,7 +26,7 @@ import app.revanced.integrations.youtube.shared.PlayerType;
/** /**
* <pre> * <pre>
* Allows hiding home feed and search results based on keywords and/or channel names. * Allows hiding home feed and search results based on video title keywords and/or channel names.
* *
* Limitations: * Limitations:
* - Searching for a keyword phrase will give no search results. * - Searching for a keyword phrase will give no search results.
@ -41,19 +41,14 @@ import app.revanced.integrations.youtube.shared.PlayerType;
* (ie: "mr beast" automatically filters "Mr Beast" and "MR BEAST"). * (ie: "mr beast" automatically filters "Mr Beast" and "MR BEAST").
* - Keywords present in the layout or video data cannot be used as filters, otherwise all videos * - Keywords present in the layout or video data cannot be used as filters, otherwise all videos
* will always be hidden. This patch checks for some of these words. * will always be hidden. This patch checks for some of these words.
* - When using whole word syntax, some keywords may need additional pluralized variations.
*/ */
@SuppressWarnings("unused") @SuppressWarnings("unused")
@RequiresApi(api = Build.VERSION_CODES.N) @RequiresApi(api = Build.VERSION_CODES.N)
final class KeywordContentFilter extends Filter { final class KeywordContentFilter extends Filter {
/** /**
* Minimum keyword/phrase length to prevent excessively broad content filtering. * Strings found in the buffer for every video. Full strings should be specified.
*/
private static final int MINIMUM_KEYWORD_LENGTH = 3;
/**
* Strings found in the buffer for every videos.
* Full strings should be specified, as they are compared using {@link String#contains(CharSequence)}.
* *
* This list does not include every common buffer string, and this can be added/changed as needed. * This list does not include every common buffer string, and this can be added/changed as needed.
* Words must be entered with the exact casing as found in the buffer. * Words must be entered with the exact casing as found in the buffer.
@ -88,7 +83,7 @@ final class KeywordContentFilter extends Filter {
"search_vwc_description_transition_key", "search_vwc_description_transition_key",
"g-high-recZ", "g-high-recZ",
// Text and litho components found in the buffer that belong to path filters. // Text and litho components found in the buffer that belong to path filters.
"metadata.eml", "expandable_metadata.eml",
"thumbnail.eml", "thumbnail.eml",
"avatar.eml", "avatar.eml",
"overflow_button.eml", "overflow_button.eml",
@ -107,7 +102,8 @@ final class KeywordContentFilter extends Filter {
"search_video_with_context.eml", "search_video_with_context.eml",
"video_with_context.eml", // Subscription tab videos. "video_with_context.eml", // Subscription tab videos.
"related_video_with_context.eml", "related_video_with_context.eml",
"video_lockup_with_attachment.eml", // A/B test for subscribed video. // A/B test for subscribed video, and sometimes when tablet layout is enabled.
"video_lockup_with_attachment.eml",
"compact_video.eml", "compact_video.eml",
"inline_shorts", "inline_shorts",
"shorts_video_cell", "shorts_video_cell",
@ -139,6 +135,12 @@ final class KeywordContentFilter extends Filter {
"overflow_button.eml" "overflow_button.eml"
); );
/**
* Minimum keyword/phrase length to prevent excessively broad content filtering.
* Only applies when not using whole word syntax.
*/
private static final int MINIMUM_KEYWORD_LENGTH = 3;
/** /**
* Threshold for {@link #filteredVideosPercentage} * Threshold for {@link #filteredVideosPercentage}
* that indicates all or nearly all videos have been filtered. * that indicates all or nearly all videos have been filtered.
@ -150,6 +152,8 @@ final class KeywordContentFilter extends Filter {
private static final long ALL_VIDEOS_FILTERED_BACKOFF_MILLISECONDS = 60 * 1000; // 60 seconds private static final long ALL_VIDEOS_FILTERED_BACKOFF_MILLISECONDS = 60 * 1000; // 60 seconds
private static final int UTF8_MAX_BYTE_COUNT = 4;
/** /**
* Rolling average of how many videos were filtered by a keyword. * Rolling average of how many videos were filtered by a keyword.
* Used to detect if a keyword passes the initial check against {@link #STRINGS_IN_EVERY_BUFFER} * Used to detect if a keyword passes the initial check against {@link #STRINGS_IN_EVERY_BUFFER}
@ -216,23 +220,167 @@ final class KeywordContentFilter extends Filter {
capitalizeNext = false; capitalizeNext = false;
} }
} }
return new String(codePoints, 0, codePoints.length); return new String(codePoints, 0, codePoints.length);
} }
/** /**
* @return If the phrase will will hide all videos. Not an exhaustive check. * @return If the string contains any characters from languages that do not use spaces between words.
*/ */
private static boolean phrasesWillHideAllVideos(@NonNull String[] phrases) { private static boolean isLanguageWithNoSpaces(String text) {
for (String commonString : STRINGS_IN_EVERY_BUFFER) { for (int i = 0, length = text.length(); i < length;) {
if (Utils.containsAny(commonString, phrases)) { final int codePoint = text.codePointAt(i);
Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
if (block == CJK_UNIFIED_IDEOGRAPHS // Chinese and Kanji
|| block == HIRAGANA // Japanese Hiragana
|| block == KATAKANA // Japanese Katakana
|| block == THAI
|| block == LAO
|| block == MYANMAR
|| block == KHMER
|| block == TIBETAN) {
return true; return true;
} }
i += Character.charCount(codePoint);
} }
return false; return false;
} }
/**
 * Checks the phrases against {@link #STRINGS_IN_EVERY_BUFFER}. Not an exhaustive check.
 *
 * @param phrases         Phrase variations to check.
 * @param matchWholeWords If true, a phrase only counts when its occurrence inside a common string
 *                        is not directly surrounded by letters. If false, any substring
 *                        occurrence counts.
 * @return If any phrase will hide all videos.
 */
private static boolean phrasesWillHideAllVideos(@NonNull String[] phrases, boolean matchWholeWords) {
    if (phrases.length == 0) {
        return false;
    }

    if (!matchWholeWords) {
        // The substring check already considers every phrase at once,
        // so it only needs to run once per common string (not once per phrase).
        for (String commonString : STRINGS_IN_EVERY_BUFFER) {
            if (Utils.containsAny(commonString, phrases)) {
                return true;
            }
        }
        return false;
    }

    for (String phrase : phrases) {
        // indexOf() never returns -1 for an empty search string,
        // which would otherwise cause the search loop below to never terminate.
        if (phrase.isEmpty()) continue;

        // The whole word check operates on UTF-8 bytes, so the match length must be in bytes.
        final int phraseByteLength = phrase.getBytes(StandardCharsets.UTF_8).length;

        for (String commonString : STRINGS_IN_EVERY_BUFFER) {
            int matchIndex = commonString.indexOf(phrase);
            if (matchIndex < 0) continue;

            byte[] commonStringBytes = commonString.getBytes(StandardCharsets.UTF_8);
            do {
                // Convert the char index into the equivalent UTF-8 byte index.
                final int matchByteIndex = commonString.substring(0, matchIndex)
                        .getBytes(StandardCharsets.UTF_8).length;
                if (keywordMatchIsWholeWord(commonStringBytes, matchByteIndex, phraseByteLength)) {
                    return true;
                }
                matchIndex = commonString.indexOf(phrase, matchIndex + 1);
            } while (matchIndex >= 0);
        }
    }

    return false;
}
/**
 * Checks the characters directly adjacent to a keyword match.
 * Adjacent numbers/symbols/punctuation (or no decodable character at all)
 * still count as a whole word match. Only adjacent letters disqualify the match.
 *
 * @param text              UTF-8 bytes that were searched.
 * @param keywordStartIndex Byte index where the match starts.
 * @param keywordLength     Length of the match, in bytes.
 * @return If the match is not surrounded by other letters.
 */
private static boolean keywordMatchIsWholeWord(byte[] text, int keywordStartIndex, int keywordLength) {
    final Integer preceding = getUtf8CodePointBefore(text, keywordStartIndex);
    final boolean letterBefore = preceding != null && Character.isLetter(preceding);
    if (letterBefore) {
        return false;
    }

    final Integer following = getUtf8CodePointAt(text, keywordStartIndex + keywordLength);
    return following == null || !Character.isLetter(following);
}
/**
 * Walks backwards from the index, trying 1 to {@link #UTF8_MAX_BYTE_COUNT} byte long
 * sequences that end immediately before the index.
 *
 * @return The code point of the first sequence accepted by {@link #isValidUtf8(byte[], int, int)},
 *         or null if no such sequence exists before the index.
 */
@Nullable
private static Integer getUtf8CodePointBefore(byte[] data, int index) {
    for (int byteCount = 1; byteCount <= UTF8_MAX_BYTE_COUNT; byteCount++) {
        final int sequenceStart = index - byteCount;
        if (sequenceStart < 0) {
            break; // Ran off the start of the data.
        }
        if (isValidUtf8(data, sequenceStart, byteCount)) {
            return decodeUtf8ToCodePoint(data, sequenceStart, byteCount);
        }
    }

    return null;
}
/**
 * Tries 1 to {@link #UTF8_MAX_BYTE_COUNT} byte long sequences that start at the index,
 * never reading past the end of the data.
 *
 * @return The code point of the first sequence accepted by {@link #isValidUtf8(byte[], int, int)},
 *         or null if no such sequence starts at the index.
 */
@Nullable
private static Integer getUtf8CodePointAt(byte[] data, int index) {
    // Number of bytes available from the index to the end of the data.
    final int bytesAvailable = data.length - index;
    final int longestSequence = Math.min(UTF8_MAX_BYTE_COUNT, bytesAvailable);

    for (int byteCount = 1; byteCount <= longestSequence; byteCount++) {
        if (isValidUtf8(data, index, byteCount)) {
            return decodeUtf8ToCodePoint(data, index, byteCount);
        }
    }

    return null;
}
/**
 * Checks if the bytes have the bit layout of a single UTF-8 encoded character:
 * a lead byte matching the sequence length, followed by 10xxxxxx continuation bytes.
 * Only the marker bits are inspected.
 *
 * @param data          Bytes to inspect.
 * @param startIndex    Index of the first (lead) byte.
 * @param numberOfBytes Sequence length to test, from 1 to 4.
 * @return If the bytes match the layout for the given sequence length.
 * @throws IllegalArgumentException If numberOfBytes is not 1, 2, 3, or 4.
 */
public static boolean isValidUtf8(byte[] data, int startIndex, int numberOfBytes) {
    final int leadMask;
    final int leadPattern;
    if (numberOfBytes == 1) { // 0xxxxxxx (ASCII)
        leadMask = 0x80;
        leadPattern = 0x00;
    } else if (numberOfBytes == 2) { // 110xxxxx
        leadMask = 0xE0;
        leadPattern = 0xC0;
    } else if (numberOfBytes == 3) { // 1110xxxx
        leadMask = 0xF0;
        leadPattern = 0xE0;
    } else if (numberOfBytes == 4) { // 11110xxx
        leadMask = 0xF8;
        leadPattern = 0xF0;
    } else {
        throw new IllegalArgumentException("numberOfBytes: " + numberOfBytes);
    }

    if ((data[startIndex] & leadMask) != leadPattern) {
        return false;
    }

    // Every byte after the lead byte must be a continuation byte: 10xxxxxx
    for (int offset = 1; offset < numberOfBytes; offset++) {
        if ((data[startIndex + offset] & 0xC0) != 0x80) {
            return false;
        }
    }

    return true;
}
/**
 * Decodes a single UTF-8 encoded character into its code point.
 * Does not validate the bytes, callers should first check them
 * using {@link #isValidUtf8(byte[], int, int)}.
 *
 * @param data          Bytes holding the encoded character.
 * @param startIndex    Index of the first (lead) byte.
 * @param numberOfBytes Length of the encoded character, from 1 to 4.
 * @return The decoded code point.
 * @throws IllegalArgumentException If numberOfBytes is not 1, 2, 3, or 4.
 */
public static int decodeUtf8ToCodePoint(byte[] data, int startIndex, int numberOfBytes) {
    if (numberOfBytes < 1 || numberOfBytes > 4) {
        throw new IllegalArgumentException("numberOfBytes: " + numberOfBytes);
    }

    if (numberOfBytes == 1) {
        // Single byte character, the byte is the code point.
        return data[startIndex];
    }

    // Payload bits of the lead byte: 2 bytes -> 0x1F, 3 bytes -> 0x0F, 4 bytes -> 0x07
    final int leadPayloadMask = 0xFF >> (numberOfBytes + 1);
    int codePoint = data[startIndex] & leadPayloadMask;

    // Each continuation byte contributes its lower 6 bits.
    for (int offset = 1; offset < numberOfBytes; offset++) {
        codePoint = (codePoint << 6) | (data[startIndex + offset] & 0x3F);
    }

    return codePoint;
}
/**
 * @return If the phrase is wrapped in double quotes, which declares whole word matching.
 *         A phrase that is only a single quote character is not considered wrapped,
 *         since the same character cannot be both the opening and the closing quote.
 */
private static boolean phraseUsesWholeWordSyntax(String phrase) {
    // Length check prevents a lone " from passing both startsWith() and endsWith(),
    // which would make stripping the quotes throw StringIndexOutOfBoundsException.
    return phrase.length() >= 2 && phrase.startsWith("\"") && phrase.endsWith("\"");
}
/**
 * Removes the first and last character of the phrase.
 * Intended for phrases wrapped in the double quotes of the whole word syntax.
 *
 * @return The phrase without its first and last character.
 */
private static String stripWholeWordSyntax(String phrase) {
    final int closingQuoteIndex = phrase.length() - 1;
    return phrase.substring(1, closingQuoteIndex);
}
private synchronized void parseKeywords() { // Must be synchronized since Litho is multi-threaded. private synchronized void parseKeywords() { // Must be synchronized since Litho is multi-threaded.
String rawKeywords = Settings.HIDE_KEYWORD_CONTENT_PHRASES.get(); String rawKeywords = Settings.HIDE_KEYWORD_CONTENT_PHRASES.get();
//noinspection StringEquality //noinspection StringEquality
if (rawKeywords == lastKeywordPhrasesParsed) { if (rawKeywords == lastKeywordPhrasesParsed) {
Logger.printDebug(() -> "Using previously initialized search"); Logger.printDebug(() -> "Using previously initialized search");
@ -243,20 +391,33 @@ final class KeywordContentFilter extends Filter {
String[] split = rawKeywords.split("\n"); String[] split = rawKeywords.split("\n");
if (split.length != 0) { if (split.length != 0) {
// Linked Set so log statement are more organized and easier to read. // Linked Set so log statement are more organized and easier to read.
Set<String> keywords = new LinkedHashSet<>(10 * split.length); // Map is: Phrase -> isWholeWord
Map<String, Boolean> keywords = new LinkedHashMap<>(10 * split.length);
for (String phrase : split) { for (String phrase : split) {
// Remove any trailing white space the user may have accidentally included. // Remove any trailing spaces the user may have accidentally included.
phrase = phrase.stripTrailing(); phrase = phrase.stripTrailing();
if (phrase.isBlank()) continue; if (phrase.isBlank()) continue;
if (phrase.length() < MINIMUM_KEYWORD_LENGTH) { final boolean wholeWordMatching;
if (phraseUsesWholeWordSyntax(phrase)) {
if (phrase.length() == 2) {
continue; // Empty "" phrase
}
phrase = stripWholeWordSyntax(phrase);
wholeWordMatching = true;
} else if (phrase.length() < MINIMUM_KEYWORD_LENGTH && !isLanguageWithNoSpaces(phrase)) {
// Allow phrases of 1 and 2 characters if using a
// language that does not use spaces between words.
// Do not reset the setting. Keep the invalid keywords so the user can fix the mistake. // Do not reset the setting. Keep the invalid keywords so the user can fix the mistake.
Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_length", phrase, MINIMUM_KEYWORD_LENGTH)); Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_length", phrase, MINIMUM_KEYWORD_LENGTH));
continue; continue;
} else {
wholeWordMatching = false;
} }
// Add common casing that might appear. // Common casing that might appear.
// //
// This could be simplified by adding case insensitive search to the prefix search, // This could be simplified by adding case insensitive search to the prefix search,
// which is very simple to add to StringTreSearch for Unicode and ByteTrieSearch for ASCII. // which is very simple to add to StringTreSearch for Unicode and ByteTrieSearch for ASCII.
@ -265,7 +426,7 @@ final class KeywordContentFilter extends Filter {
// UTF-8 characters can be different byte lengths, which does // UTF-8 characters can be different byte lengths, which does
// not allow comparing two different byte arrays using simple plain array indexes. // not allow comparing two different byte arrays using simple plain array indexes.
// //
// Instead add all common case variations of the words. // Instead use all common case variations of the words.
String[] phraseVariations = { String[] phraseVariations = {
phrase, phrase,
phrase.toLowerCase(), phrase.toLowerCase(),
@ -273,20 +434,45 @@ final class KeywordContentFilter extends Filter {
capitalizeAllFirstLetters(phrase), capitalizeAllFirstLetters(phrase),
phrase.toUpperCase() phrase.toUpperCase()
}; };
if (phrasesWillHideAllVideos(phraseVariations)) {
Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_common", phrase)); if (phrasesWillHideAllVideos(phraseVariations, wholeWordMatching)) {
String toastMessage;
// If whole word matching is off, but would pass with on, then show a different toast.
if (!wholeWordMatching && !phrasesWillHideAllVideos(phraseVariations, true)) {
toastMessage = "revanced_hide_keyword_toast_invalid_common_whole_word_required";
} else {
toastMessage = "revanced_hide_keyword_toast_invalid_common";
}
Utils.showToastLong(str(toastMessage, phrase));
continue; continue;
} }
keywords.addAll(Arrays.asList(phraseVariations)); for (String variation : phraseVariations) {
// Check if the same phrase is declared both with and without quotes.
Boolean existing = keywords.get(variation);
if (existing == null) {
keywords.put(variation, wholeWordMatching);
} else if (existing != wholeWordMatching) {
Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_conflicting", phrase));
break;
}
}
} }
for (String keyword : keywords) { for (Map.Entry<String, Boolean> entry : keywords.entrySet()) {
// Use a callback to get the keyword that matched. String keyword = entry.getKey();
// TrieSearch could have this built in, but that's slightly more complicated since //noinspection ExtractMethodRecommender
// the strings are stored as a byte array and embedded in the search tree. final boolean isWholeWord = entry.getValue();
TrieSearch.TriePatternMatchedCallback<byte[]> callback = TrieSearch.TriePatternMatchedCallback<byte[]> callback =
(textSearched, matchedStartIndex, matchedLength, callbackParameter) -> { (textSearched, startIndex, matchLength, callbackParameter) -> {
if (isWholeWord && !keywordMatchIsWholeWord(textSearched, startIndex, matchLength)) {
return false;
}
Logger.printDebug(() -> (isWholeWord ? "Matched whole keyword: '"
: "Matched keyword: '") + keyword + "'");
// noinspection unchecked // noinspection unchecked
((MutableReference<String>) callbackParameter).value = keyword; ((MutableReference<String>) callbackParameter).value = keyword;
return true; return true;
@ -295,7 +481,7 @@ final class KeywordContentFilter extends Filter {
search.addPattern(stringBytes, callback); search.addPattern(stringBytes, callback);
} }
Logger.printDebug(() -> "Search using: (" + search.getEstimatedMemorySize() + " KB) keywords: " + keywords); Logger.printDebug(() -> "Search using: (" + search.getEstimatedMemorySize() + " KB) keywords: " + keywords.keySet());
} }
bufferSearch = search; bufferSearch = search;
@ -382,7 +568,7 @@ final class KeywordContentFilter extends Filter {
// Field is intentionally compared using reference equality. // Field is intentionally compared using reference equality.
//noinspection StringEquality //noinspection StringEquality
if (Settings.HIDE_KEYWORD_CONTENT_PHRASES.get() != lastKeywordPhrasesParsed) { if (Settings.HIDE_KEYWORD_CONTENT_PHRASES.get() != lastKeywordPhrasesParsed) {
// User changed the keywords. // User changed the keywords or whole word setting.
parseKeywords(); parseKeywords();
} }

View File

@ -0,0 +1,35 @@
package app.revanced.integrations.youtube.settings.preference;
import static android.text.Html.FROM_HTML_MODE_COMPACT;
import android.content.Context;
import android.os.Build;
import android.preference.Preference;
import android.text.Html;
import android.util.AttributeSet;
import androidx.annotation.RequiresApi;
/**
 * Allows using basic html for the summary text.
 *
 * The summary is converted from html once, when the preference is constructed.
 * A preference without a summary is left unchanged.
 */
@SuppressWarnings({"unused", "deprecation"})
@RequiresApi(api = Build.VERSION_CODES.O)
public class HtmlPreference extends Preference {
    {
        // Instance initializer, runs as part of every constructor after the superclass constructor.
        // The summary is null if none was declared (such as when created using only a Context),
        // and must be checked to prevent a NullPointerException.
        CharSequence summary = getSummary();
        if (summary != null) {
            setSummary(Html.fromHtml(summary.toString(), FROM_HTML_MODE_COMPACT));
        }
    }

    public HtmlPreference(Context context, AttributeSet attrs, int defStyleAttr, int defStyleRes) {
        super(context, attrs, defStyleAttr, defStyleRes);
    }

    public HtmlPreference(Context context, AttributeSet attrs, int defStyleAttr) {
        super(context, attrs, defStyleAttr);
    }

    public HtmlPreference(Context context, AttributeSet attrs) {
        super(context, attrs);
    }

    public HtmlPreference(Context context) {
        super(context);
    }
}