feat(YouTube - Keyword filter): Add syntax to match whole keywords and not substrings (#681)

Co-authored-by: oSumAtrIX <johan.melkonyan1@web.de>
2024-11-30 15:52:55 +01:00 · 2024-08-30 17:38:44 -04:00 · 2024-08-30 17:38:44 -04:00 · 5314dd90d1
commit 5314dd90d1
parent db81332078
2 changed files with 253 additions and 32 deletions
--- a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java
+++ b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java
@ -2,6 +2,7 @@ package app.revanced.integrations.youtube.patches.components;
 import static app.revanced.integrations.shared.StringRef.str;
 import static app.revanced.integrations.youtube.shared.NavigationBar.NavigationButton;
 import static java.lang.Character.UnicodeBlock.*;
 import android.os.Build;
@ -10,9 +11,8 @@ import androidx.annotation.Nullable;
 import androidx.annotation.RequiresApi;
 import java.nio.charset.StandardCharsets;
-import java.util.Arrays;
+import java.util.LinkedHashMap;
-import java.util.LinkedHashSet;
+import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicReference;
 import app.revanced.integrations.shared.Logger;
@ -26,7 +26,7 @@ import app.revanced.integrations.youtube.shared.PlayerType;
 /**
 * <pre>
- * Allows hiding home feed and search results based on keywords and/or channel names.
+ * Allows hiding home feed and search results based on video title keywords and/or channel names.
 *
 * Limitations:
 * - Searching for a keyword phrase will give no search results.
@ -41,19 +41,14 @@ import app.revanced.integrations.youtube.shared.PlayerType;
 *   (ie: "mr beast" automatically filters "Mr Beast" and "MR BEAST").
 * - Keywords present in the layout or video data cannot be used as filters, otherwise all videos
 *   will always be hidden.  This patch checks for some of these words.
 * - When using whole word syntax, some keywords may need additional pluralized variations.
 */
@SuppressWarnings("unused")
@RequiresApi(api = Build.VERSION_CODES.N)
 final class KeywordContentFilter extends Filter {
    /**
-     * Minimum keyword/phrase length to prevent excessively broad content filtering.
+     * Strings found in the buffer for every videos.  Full strings should be specified.
     */
    private static final int MINIMUM_KEYWORD_LENGTH = 3;
    /**
     * Strings found in the buffer for every video.
     * Full strings should be specified, as they are compared using {@link String#contains(CharSequence)}.
     *
     * This list does not include every common buffer string, and this can be added/changed as needed.
     * Words must be entered with the exact casing as found in the buffer.
@ -88,7 +83,7 @@ final class KeywordContentFilter extends Filter {
            "search_vwc_description_transition_key",
            "g-high-recZ",
            // Text and litho components found in the buffer that belong to path filters.
-            "metadata.eml",
+            "expandable_metadata.eml",
            "thumbnail.eml",
            "avatar.eml",
            "overflow_button.eml",
@ -107,7 +102,8 @@ final class KeywordContentFilter extends Filter {
            "search_video_with_context.eml",
            "video_with_context.eml", // Subscription tab videos.
            "related_video_with_context.eml",
-            "video_lockup_with_attachment.eml", // A/B test for subscribed video.
+            // A/B test for subscribed video, and sometimes when tablet layout is enabled.
            "video_lockup_with_attachment.eml",
            "compact_video.eml",
            "inline_shorts",
            "shorts_video_cell",
@ -139,6 +135,12 @@ final class KeywordContentFilter extends Filter {
            "overflow_button.eml"
    );
    /**
     * Minimum keyword/phrase length to prevent excessively broad content filtering.
     * Only applies when not using whole word syntax.
     */
    private static final int MINIMUM_KEYWORD_LENGTH = 3;
    /**
     * Threshold for {@link #filteredVideosPercentage}
     * that indicates all or nearly all videos have been filtered.
@ -150,6 +152,8 @@ final class KeywordContentFilter extends Filter {
    private static final long ALL_VIDEOS_FILTERED_BACKOFF_MILLISECONDS = 60 * 1000; // 60 seconds
    private static final int UTF8_MAX_BYTE_COUNT = 4;
    /**
     * Rolling average of how many videos were filtered by a keyword.
     * Used to detect if a keyword passes the initial check against {@link #STRINGS_IN_EVERY_BUFFER}
@ -216,23 +220,167 @@ final class KeywordContentFilter extends Filter {
                capitalizeNext = false;
            }
        }
        return new String(codePoints, 0, codePoints.length);
    }
    /**
-     * @return If the phrase will will hide all videos. Not an exhaustive check.
+     * @return If the string contains any characters from languages that do not use spaces between words.
     */
-    private static boolean phrasesWillHideAllVideos(@NonNull String[] phrases) {
+    private static boolean isLanguageWithNoSpaces(String text) {
-        for (String commonString : STRINGS_IN_EVERY_BUFFER) {
+        for (int i = 0, length = text.length(); i < length;) {
-            if (Utils.containsAny(commonString, phrases)) {
+            final int codePoint = text.codePointAt(i);
            Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
            if (block == CJK_UNIFIED_IDEOGRAPHS // Chinese and Kanji
                    || block == HIRAGANA // Japanese Hiragana
                    || block == KATAKANA // Japanese Katakana
                    || block == THAI
                    || block == LAO
                    || block == MYANMAR
                    || block == KHMER
                    || block == TIBETAN) {
                return true;
            }
            i += Character.charCount(codePoint);
        }
        return false;
    }
    /**
     * Checks the phrases against {@link #STRINGS_IN_EVERY_BUFFER}.
     * Not an exhaustive check.
     *
     * @param phrases         All variations of a single keyword phrase.
     * @param matchWholeWords If true, a phrase only counts as found when the match inside the
     *                        common string is not directly surrounded by other letters.
     * @return If any phrase is found in a common buffer string, and would therefore hide all videos.
     */
    private static boolean phrasesWillHideAllVideos(@NonNull String[] phrases, boolean matchWholeWords) {
        if (phrases.length == 0) {
            return false;
        }

        if (!matchWholeWords) {
            // containsAny() already checks every phrase,
            // so a single pass over the common strings is enough.
            for (String commonString : STRINGS_IN_EVERY_BUFFER) {
                if (Utils.containsAny(commonString, phrases)) {
                    return true;
                }
            }
            return false;
        }

        for (String commonString : STRINGS_IN_EVERY_BUFFER) {
            // Encode once per common string, not once per phrase.
            final byte[] commonStringBytes = commonString.getBytes(StandardCharsets.UTF_8);

            for (String phrase : phrases) {
                // indexOf() of an empty string never returns a negative value,
                // which would otherwise cause the search loop below to never end.
                if (phrase.isEmpty()) continue;

                int matchIndex = commonString.indexOf(phrase);
                while (matchIndex >= 0) {
                    // indexOf() gives an index and length in chars, but the whole word check
                    // works on UTF-8 bytes. Convert both so multi-byte characters line up.
                    final int matchByteIndex = commonString.substring(0, matchIndex)
                            .getBytes(StandardCharsets.UTF_8).length;
                    final int phraseByteLength = phrase.getBytes(StandardCharsets.UTF_8).length;

                    if (keywordMatchIsWholeWord(commonStringBytes, matchByteIndex, phraseByteLength)) {
                        return true;
                    }

                    matchIndex = commonString.indexOf(phrase, matchIndex + 1);
                }
            }
        }

        return false;
    }
    /**
     * Checks the characters directly before and after a keyword match.
     * A neighbor that is a number/symbol/punctuation, or that cannot be decoded
     * (including the start or end of the text), does not prevent a whole word match.
     *
     * @param text              UTF-8 bytes that were searched.
     * @param keywordStartIndex Byte index where the match starts.
     * @param keywordLength     Byte length of the match.
     * @return If the match is not directly preceded or followed by a letter.
     */
    private static boolean keywordMatchIsWholeWord(byte[] text, int keywordStartIndex, int keywordLength) {
        final Integer precedingCodePoint = getUtf8CodePointBefore(text, keywordStartIndex);
        final boolean letterBeforeMatch = precedingCodePoint != null
                && Character.isLetter(precedingCodePoint);
        if (letterBeforeMatch) {
            return false;
        }

        final int keywordEndIndex = keywordStartIndex + keywordLength;
        final Integer followingCodePoint = getUtf8CodePointAt(text, keywordEndIndex);

        return followingCodePoint == null || !Character.isLetter(followingCodePoint);
    }
    /**
     * Walks backwards from the index, one byte at a time up to {@link #UTF8_MAX_BYTE_COUNT} bytes,
     * and decodes the first span that forms a valid UTF8 character ending just before the index.
     *
     * @return The UTF8 character point immediately before the index,
     *         or null if the bytes before the index is not a valid UTF8 character.
     */
    @Nullable
    private static Integer getUtf8CodePointBefore(byte[] data, int index) {
        for (int byteCount = 1, startIndex = index - 1;
             startIndex >= 0 && byteCount <= UTF8_MAX_BYTE_COUNT;
             byteCount++, startIndex--) {
            if (!isValidUtf8(data, startIndex, byteCount)) {
                continue;
            }
            return decodeUtf8ToCodePoint(data, startIndex, byteCount);
        }

        return null;
    }
    /**
     * Tries character lengths of 1 up to {@link #UTF8_MAX_BYTE_COUNT} bytes starting at the index,
     * never reading past the end of the data, and decodes the first length that is valid.
     *
     * @return The UTF8 character point at the index,
     *         or null if the index holds no valid UTF8 character.
     */
    @Nullable
    private static Integer getUtf8CodePointAt(byte[] data, int index) {
        final int bytesAvailable = data.length - index;

        for (int byteCount = 1;
             byteCount <= bytesAvailable && byteCount <= UTF8_MAX_BYTE_COUNT;
             byteCount++) {
            if (!isValidUtf8(data, index, byteCount)) {
                continue;
            }
            return decodeUtf8ToCodePoint(data, index, byteCount);
        }

        return null;
    }
    /**
     * Checks only the UTF8 marker bits: the lead byte must match the pattern for
     * the given length, and every following byte must be a continuation byte (10xxxxxx).
     *
     * @param data          Bytes to inspect.
     * @param startIndex    Index of the candidate lead byte.
     * @param numberOfBytes Candidate character length, from 1 to 4.
     * @return If the bytes have the bit layout of a UTF8 character of the given length.
     * @throws IllegalArgumentException If numberOfBytes is not 1, 2, 3 or 4.
     */
    public static boolean isValidUtf8(byte[] data, int startIndex, int numberOfBytes) {
        final int leadMask;
        final int leadPattern;

        switch (numberOfBytes) {
            case 1: // 0xxxxxxx (ASCII)
                leadMask = 0x80;
                leadPattern = 0x00;
                break;
            case 2: // 110xxxxx
                leadMask = 0xE0;
                leadPattern = 0xC0;
                break;
            case 3: // 1110xxxx
                leadMask = 0xF0;
                leadPattern = 0xE0;
                break;
            case 4: // 11110xxx
                leadMask = 0xF8;
                leadPattern = 0xF0;
                break;
            default:
                throw new IllegalArgumentException("numberOfBytes: " + numberOfBytes);
        }

        if ((data[startIndex] & leadMask) != leadPattern) {
            return false;
        }

        // Remaining bytes must all be 10xxxxxx. Checked in order, stopping at the first mismatch.
        for (int offset = 1; offset < numberOfBytes; offset++) {
            if ((data[startIndex + offset] & 0xC0) != 0x80) {
                return false;
            }
        }

        return true;
    }
    /**
     * Decodes a UTF8 character into its code point.
     * The marker bits are not verified here.
     *
     * @param data          Bytes holding the character.
     * @param startIndex    Index of the lead byte.
     * @param numberOfBytes Character length, from 1 to 4.
     * @return The code point assembled from the payload bits of each byte.
     *         For a length of 1 the byte value is returned as is.
     * @throws IllegalArgumentException If numberOfBytes is not 1, 2, 3 or 4.
     */
    public static int decodeUtf8ToCodePoint(byte[] data, int startIndex, int numberOfBytes) {
        final int leadPayloadMask;

        switch (numberOfBytes) {
            case 1:
                return data[startIndex];
            case 2:
                leadPayloadMask = 0x1F;
                break;
            case 3:
                leadPayloadMask = 0x0F;
                break;
            case 4:
                leadPayloadMask = 0x07;
                break;
            default:
                throw new IllegalArgumentException("numberOfBytes: " + numberOfBytes);
        }

        int codePoint = data[startIndex] & leadPayloadMask;
        // Each continuation byte contributes its lower 6 bits.
        for (int offset = 1; offset < numberOfBytes; offset++) {
            codePoint = (codePoint << 6) | (data[startIndex + offset] & 0x3F);
        }

        return codePoint;
    }
    /**
     * @return If the phrase is wrapped in a pair of double quotes, such as {@code "ai"}.
     *         A phrase that is only a single quote character is not whole word syntax.
     */
    private static boolean phraseUsesWholeWordSyntax(String phrase) {
        // A lone quote both starts and ends with a quote, but it has no closing quote
        // to strip, and removing the first and last character from it would throw.
        return phrase.length() >= 2 && phrase.startsWith("\"") && phrase.endsWith("\"");
    }
    /**
     * @return The phrase with its first and last character removed.
     */
    private static String stripWholeWordSyntax(String phrase) {
        final int closingQuoteIndex = phrase.length() - 1;
        return phrase.substring(1, closingQuoteIndex);
    }
    private synchronized void parseKeywords() { // Must be synchronized since Litho is multi-threaded.
        String rawKeywords = Settings.HIDE_KEYWORD_CONTENT_PHRASES.get();
        //noinspection StringEquality
        if (rawKeywords == lastKeywordPhrasesParsed) {
            Logger.printDebug(() -> "Using previously initialized search");
@ -243,20 +391,33 @@ final class KeywordContentFilter extends Filter {
        String[] split = rawKeywords.split("\n");
        if (split.length != 0) {
            // Linked map so log statements are more organized and easier to read.
-            Set<String> keywords = new LinkedHashSet<>(10 * split.length);
+            // Map is: Phrase -> isWholeWord
            Map<String, Boolean> keywords = new LinkedHashMap<>(10 * split.length);
            for (String phrase : split) {
-                // Remove any trailing white space the user may have accidentally included.
+                // Remove any trailing spaces the user may have accidentally included.
                phrase = phrase.stripTrailing();
                if (phrase.isBlank()) continue;
-                if (phrase.length() < MINIMUM_KEYWORD_LENGTH) {
+                final boolean wholeWordMatching;
                if (phraseUsesWholeWordSyntax(phrase)) {
                    if (phrase.length() == 2) {
                        continue; // Empty "" phrase
                    }
                    phrase = stripWholeWordSyntax(phrase);
                    wholeWordMatching = true;
                } else if (phrase.length() < MINIMUM_KEYWORD_LENGTH && !isLanguageWithNoSpaces(phrase)) {
                    // Allow phrases of 1 and 2 characters if using a
                    // language that does not use spaces between words.
                    // Do not reset the setting. Keep the invalid keywords so the user can fix the mistake.
                    Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_length", phrase, MINIMUM_KEYWORD_LENGTH));
                    continue;
                } else {
                    wholeWordMatching = false;
                }
-                // Add common casing that might appear.
+                // Common casing that might appear.
                //
                // This could be simplified by adding case insensitive search to the prefix search,
                // which is very simple to add to StringTrieSearch for Unicode and ByteTrieSearch for ASCII.
@ -265,7 +426,7 @@ final class KeywordContentFilter extends Filter {
                // UTF-8 characters can be different byte lengths, which does
                // not allow comparing two different byte arrays using simple plain array indexes.
                //
-                // Instead add all common case variations of the words.
+                // Instead use all common case variations of the words.
                String[] phraseVariations = {
                        phrase,
                        phrase.toLowerCase(),
@ -273,20 +434,45 @@ final class KeywordContentFilter extends Filter {
                        capitalizeAllFirstLetters(phrase),
                        phrase.toUpperCase()
                };
-                if (phrasesWillHideAllVideos(phraseVariations)) {
+
-                    Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_common", phrase));
+                if (phrasesWillHideAllVideos(phraseVariations, wholeWordMatching)) {
                    String toastMessage;
                    // If whole word matching is off, but would pass with on, then show a different toast.
                    if (!wholeWordMatching && !phrasesWillHideAllVideos(phraseVariations, true)) {
                        toastMessage = "revanced_hide_keyword_toast_invalid_common_whole_word_required";
                    } else {
                        toastMessage = "revanced_hide_keyword_toast_invalid_common";
                    }
                    Utils.showToastLong(str(toastMessage, phrase));
                    continue;
                }
-                keywords.addAll(Arrays.asList(phraseVariations));
+                for (String variation : phraseVariations) {
                    // Check if the same phrase is declared both with and without quotes.
                    Boolean existing = keywords.get(variation);
                    if (existing == null) {
                        keywords.put(variation, wholeWordMatching);
                    } else if (existing != wholeWordMatching) {
                        Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_conflicting", phrase));
                        break;
                    }
                }
            }
-            for (String keyword : keywords) {
+            for (Map.Entry<String, Boolean> entry : keywords.entrySet()) {
-                // Use a callback to get the keyword that matched.
+                String keyword = entry.getKey();
-                // TrieSearch could have this built in, but that's slightly more complicated since
+                //noinspection ExtractMethodRecommender
-                // the strings are stored as a byte array and embedded in the search tree.
+                final boolean isWholeWord = entry.getValue();
                TrieSearch.TriePatternMatchedCallback<byte[]> callback =
-                        (textSearched, matchedStartIndex, matchedLength, callbackParameter) -> {
+                        (textSearched, startIndex, matchLength, callbackParameter) -> {
                            if (isWholeWord && !keywordMatchIsWholeWord(textSearched, startIndex, matchLength)) {
                                return false;
                            }
                            Logger.printDebug(() -> (isWholeWord ? "Matched whole keyword: '"
                                    : "Matched keyword: '") + keyword + "'");
                            // noinspection unchecked
                            ((MutableReference<String>) callbackParameter).value = keyword;
                            return true;
@ -295,7 +481,7 @@ final class KeywordContentFilter extends Filter {
                search.addPattern(stringBytes, callback);
            }
-            Logger.printDebug(() -> "Search using: (" + search.getEstimatedMemorySize() + " KB) keywords: " + keywords);
+            Logger.printDebug(() -> "Search using: (" + search.getEstimatedMemorySize() + " KB) keywords: " + keywords.keySet());
        }
        bufferSearch = search;
@ -382,7 +568,7 @@ final class KeywordContentFilter extends Filter {
        // Field is intentionally compared using reference equality.
        //noinspection StringEquality
        if (Settings.HIDE_KEYWORD_CONTENT_PHRASES.get() != lastKeywordPhrasesParsed) {
-            // User changed the keywords.
+            // User changed the keywords or whole word setting.
            parseKeywords();
        }
--- a/app/src/main/java/app/revanced/integrations/youtube/settings/preference/HtmlPreference.java
+++ b/app/src/main/java/app/revanced/integrations/youtube/settings/preference/HtmlPreference.java
@ -0,0 +1,35 @@
 package app.revanced.integrations.youtube.settings.preference;
 import static android.text.Html.FROM_HTML_MODE_COMPACT;
 import android.content.Context;
 import android.os.Build;
 import android.preference.Preference;
 import android.text.Html;
 import android.util.AttributeSet;
 import androidx.annotation.RequiresApi;
 /**
 * Allows using basic html for the summary text.
 *
 * The summary is converted from html once, when the preference is created.
 */
@SuppressWarnings({"unused", "deprecation"})
@RequiresApi(api = Build.VERSION_CODES.O)
 public class HtmlPreference extends Preference {
    {
        // Instance initializer, shared by every constructor below.
        // It runs after the superclass constructor has finished.
        final CharSequence summary = getSummary();
        // The summary can be null if none was declared, so only convert when present.
        if (summary != null) {
            setSummary(Html.fromHtml(summary.toString(), FROM_HTML_MODE_COMPACT));
        }
    }
    public HtmlPreference(Context context, AttributeSet attrs, int defStyleAttr, int defStyleRes) {
        super(context, attrs, defStyleAttr, defStyleRes);
    }
    public HtmlPreference(Context context, AttributeSet attrs, int defStyleAttr) {
        super(context, attrs, defStyleAttr);
    }
    public HtmlPreference(Context context, AttributeSet attrs) {
        super(context, attrs);
    }
    public HtmlPreference(Context context) {
        super(context);
    }
 }