feat(YouTube - Keyword filter): Add syntax to match whole keywords and not substrings (#681)

Co-authored-by: oSumAtrIX <johan.melkonyan1@web.de>
2024-11-30 15:52:55 +01:00 · 2024-08-30 17:38:44 -04:00 · 2024-08-30 17:38:44 -04:00 · 5314dd90d1
commit 5314dd90d1
parent db81332078
2 changed files with 253 additions and 32 deletions
--- a/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java
+++ b/app/src/main/java/app/revanced/integrations/youtube/patches/components/KeywordContentFilter.java
@ -2,6 +2,7 @@ package app.revanced.integrations.youtube.patches.components;

 import static app.revanced.integrations.shared.StringRef.str;
 import static app.revanced.integrations.youtube.shared.NavigationBar.NavigationButton;
+import static java.lang.Character.UnicodeBlock.*;

 import android.os.Build;

@ -10,9 +11,8 @@ import androidx.annotation.Nullable;
 import androidx.annotation.RequiresApi;

 import java.nio.charset.StandardCharsets;
-import java.util.Arrays;
-import java.util.LinkedHashSet;
-import java.util.Set;
+import java.util.LinkedHashMap;
+import java.util.Map;
 import java.util.concurrent.atomic.AtomicReference;

 import app.revanced.integrations.shared.Logger;
@ -26,7 +26,7 @@ import app.revanced.integrations.youtube.shared.PlayerType;

 /**
 * <pre>
- * Allows hiding home feed and search results based on keywords and/or channel names.
+ * Allows hiding home feed and search results based on video title keywords and/or channel names.
 *
 * Limitations:
 * - Searching for a keyword phrase will give no search results.
@ -41,19 +41,14 @@ import app.revanced.integrations.youtube.shared.PlayerType;
 *   (ie: "mr beast" automatically filters "Mr Beast" and "MR BEAST").
 * - Keywords present in the layout or video data cannot be used as filters, otherwise all videos
 *   will always be hidden.  This patch checks for some of these words.
+ * - When using whole word syntax, some keywords may need additional pluralized variations.
 */
@SuppressWarnings("unused")
@RequiresApi(api = Build.VERSION_CODES.N)
 final class KeywordContentFilter extends Filter {

    /**
-     * Minimum keyword/phrase length to prevent excessively broad content filtering.
-     */
-    private static final int MINIMUM_KEYWORD_LENGTH = 3;
-
-    /**
-     * Strings found in the buffer for every videos.
-     * Full strings should be specified, as they are compared using {@link String#contains(CharSequence)}.
+     * Strings found in the buffer for every video.  Full strings should be specified.
     *
     * This list does not include every common buffer string, and this can be added/changed as needed.
     * Words must be entered with the exact casing as found in the buffer.
@ -88,7 +83,7 @@ final class KeywordContentFilter extends Filter {
            "search_vwc_description_transition_key",
            "g-high-recZ",
            // Text and litho components found in the buffer that belong to path filters.
-            "metadata.eml",
+            "expandable_metadata.eml",
            "thumbnail.eml",
            "avatar.eml",
            "overflow_button.eml",
@ -107,7 +102,8 @@ final class KeywordContentFilter extends Filter {
            "search_video_with_context.eml",
            "video_with_context.eml", // Subscription tab videos.
            "related_video_with_context.eml",
-            "video_lockup_with_attachment.eml", // A/B test for subscribed video.
+            // A/B test for subscribed video, and sometimes when tablet layout is enabled.
+            "video_lockup_with_attachment.eml",
            "compact_video.eml",
            "inline_shorts",
            "shorts_video_cell",
@ -139,6 +135,12 @@ final class KeywordContentFilter extends Filter {
            "overflow_button.eml"
    );

+    /**
+     * Minimum keyword/phrase length to prevent excessively broad content filtering.
+     * Only applies when not using whole word syntax.
+     */
+    private static final int MINIMUM_KEYWORD_LENGTH = 3;
+
    /**
     * Threshold for {@link #filteredVideosPercentage}
     * that indicates all or nearly all videos have been filtered.
@ -150,6 +152,8 @@ final class KeywordContentFilter extends Filter {

    private static final long ALL_VIDEOS_FILTERED_BACKOFF_MILLISECONDS = 60 * 1000; // 60 seconds

+    private static final int UTF8_MAX_BYTE_COUNT = 4;
+
    /**
     * Rolling average of how many videos were filtered by a keyword.
     * Used to detect if a keyword passes the initial check against {@link #STRINGS_IN_EVERY_BUFFER}
@ -216,23 +220,167 @@ final class KeywordContentFilter extends Filter {
                capitalizeNext = false;
            }
        }
+
        return new String(codePoints, 0, codePoints.length);
    }

    /**
-     * @return If the phrase will will hide all videos. Not an exhaustive check.
+     * @return If the string contains any characters from languages that do not use spaces between words.
     */
-    private static boolean phrasesWillHideAllVideos(@NonNull String[] phrases) {
-        for (String commonString : STRINGS_IN_EVERY_BUFFER) {
-            if (Utils.containsAny(commonString, phrases)) {
+    private static boolean isLanguageWithNoSpaces(String text) {
+        // Walk the text by code point (not by char), so a character stored
+        // as a surrogate pair is looked up as one unit.
+        for (int i = 0, length = text.length(); i < length;) {
+            final int codePoint = text.codePointAt(i);
+
+            // Only the Unicode blocks listed here are checked.
+            // A single character from any of them is enough to return true.
+            Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
+            if (block == CJK_UNIFIED_IDEOGRAPHS // Chinese and Kanji
+                    || block == HIRAGANA // Japanese Hiragana
+                    || block == KATAKANA // Japanese Katakana
+                    || block == THAI
+                    || block == LAO
+                    || block == MYANMAR
+                    || block == KHMER
+                    || block == TIBETAN) {
                return true;
            }
+
+            // Advance by the number of chars this code point occupies.
+            i += Character.charCount(codePoint);
        }
+
        return false;
    }

+    /**
+     * @return If the phrase will hide all videos. Not an exhaustive check.
+     */
+    private static boolean phrasesWillHideAllVideos(@NonNull String[] phrases, boolean matchWholeWords) {
+        for (String phrase : phrases) {
+            for (String commonString : STRINGS_IN_EVERY_BUFFER) {
+                if (matchWholeWords) {
+                    byte[] commonStringBytes = commonString.getBytes(StandardCharsets.UTF_8);
+                    int matchIndex = 0;
+                    while (true) {
+                        matchIndex = commonString.indexOf(phrase, matchIndex);
+                        if (matchIndex < 0) break;
+
+                        if (keywordMatchIsWholeWord(commonStringBytes, matchIndex, phrase.length())) {
+                            return true;
+                        }
+
+                        matchIndex++;
+                    }
+                } else if (Utils.containsAny(commonString, phrases)) {
+                    return true;
+                }
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Checks the characters directly next to a keyword match.
+     * A neighbor that is a number/symbol/punctuation, or a neighbor that could not be
+     * decoded (start or end of the text, or not UTF8), counts as a word boundary.
+     *
+     * @param text              Bytes that were searched.
+     * @param keywordStartIndex Byte index where the match begins.
+     * @param keywordLength     Number of bytes that matched.
+     * @return If neither the character before nor the character after the match is a letter.
+     */
+    private static boolean keywordMatchIsWholeWord(byte[] text, int keywordStartIndex, int keywordLength) {
+        final Integer leading = getUtf8CodePointBefore(text, keywordStartIndex);
+        final boolean letterBefore = leading != null && Character.isLetter(leading);
+        if (letterBefore) {
+            return false;
+        }
+
+        final Integer trailing = getUtf8CodePointAt(text, keywordStartIndex + keywordLength);
+        return trailing == null || !Character.isLetter(trailing);
+    }
+
+    /**
+     * @return The UTF8 character point immediately before the index,
+     *         or null if the bytes before the index is not a valid UTF8 character.
+     */
+    @Nullable
+    private static Integer getUtf8CodePointBefore(byte[] data, int index) {
+        int characterByteCount = 0;
+        while (--index >= 0 && ++characterByteCount <= UTF8_MAX_BYTE_COUNT) {
+            if (isValidUtf8(data, index, characterByteCount)) {
+                return decodeUtf8ToCodePoint(data, index, characterByteCount);
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * Looks for a UTF8 character that starts at the index,
+     * trying the shortest encoding first, up to {@link #UTF8_MAX_BYTE_COUNT} bytes.
+     *
+     * @return The UTF8 character point at the index,
+     *         or null if the index holds no valid UTF8 character.
+     */
+    @Nullable
+    private static Integer getUtf8CodePointAt(byte[] data, int index) {
+        // Never try a length that would read past the end of the data.
+        final int longestCandidate = Math.min(UTF8_MAX_BYTE_COUNT, data.length - index);
+
+        for (int byteCount = 1; byteCount <= longestCandidate; byteCount++) {
+            if (isValidUtf8(data, index, byteCount)) {
+                return decodeUtf8ToCodePoint(data, index, byteCount);
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * Checks the bit patterns of a UTF8 sequence: the lead byte must announce exactly
+     * the given length, and every following byte must be a continuation byte (10xxxxxx).
+     * Only the bit patterns are checked.  The caller must ensure the bytes are in bounds.
+     *
+     * @param data          Bytes to check.
+     * @param startIndex    Index of the lead byte.
+     * @param numberOfBytes Sequence length to check, from 1 to 4.
+     * @return If the bytes have the layout of a UTF8 character of the given length.
+     * @throws IllegalArgumentException If numberOfBytes is not 1, 2, 3, or 4.
+     */
+    public static boolean isValidUtf8(byte[] data, int startIndex, int numberOfBytes) {
+        final int leadMask;
+        final int leadPattern;
+        switch (numberOfBytes) {
+            case 1: // 0xxxxxxx (ASCII)
+                leadMask = 0x80;
+                leadPattern = 0x00;
+                break;
+            case 2: // 110xxxxx
+                leadMask = 0xE0;
+                leadPattern = 0xC0;
+                break;
+            case 3: // 1110xxxx
+                leadMask = 0xF0;
+                leadPattern = 0xE0;
+                break;
+            case 4: // 11110xxx
+                leadMask = 0xF8;
+                leadPattern = 0xF0;
+                break;
+            default:
+                throw new IllegalArgumentException("numberOfBytes: " + numberOfBytes);
+        }
+
+        if ((data[startIndex] & leadMask) != leadPattern) {
+            return false;
+        }
+
+        // Every byte after the lead byte must be 10xxxxxx.
+        for (int offset = 1; offset < numberOfBytes; offset++) {
+            if ((data[startIndex + offset] & 0xC0) != 0x80) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    public static int decodeUtf8ToCodePoint(byte[] data, int startIndex, int numberOfBytes) {
+        switch (numberOfBytes) {
+            case 1:
+                return data[startIndex];
+            case 2:
+                return ((data[startIndex] & 0x1F) << 6) |
+                        (data[startIndex + 1] & 0x3F);
+            case 3:
+                return ((data[startIndex] & 0x0F) << 12) |
+                        ((data[startIndex + 1] & 0x3F) << 6) |
+                        (data[startIndex + 2] & 0x3F);
+            case 4:
+                return ((data[startIndex] & 0x07) << 18) |
+                        ((data[startIndex + 1] & 0x3F) << 12) |
+                        ((data[startIndex + 2] & 0x3F) << 6) |
+                        (data[startIndex + 3] & 0x3F);
+        }
+        throw new IllegalArgumentException("numberOfBytes: " + numberOfBytes);
+    }
+
+    /**
+     * @return If the phrase is wrapped in a pair of double quotes.
+     *         An empty quoted phrase ("") also returns true.
+     */
+    private static boolean phraseUsesWholeWordSyntax(String phrase) {
+        // A lone quote character both starts and ends with a quote, but it is not a pair.
+        // Without the length check, stripping the quotes from it throws an exception.
+        return phrase.length() >= 2 && phrase.startsWith("\"") && phrase.endsWith("\"");
+    }
+
+    private static String stripWholeWordSyntax(String phrase) {
+        return phrase.substring(1, phrase.length() - 1);
+    }
+
    private synchronized void parseKeywords() { // Must be synchronized since Litho is multi-threaded.
        String rawKeywords = Settings.HIDE_KEYWORD_CONTENT_PHRASES.get();
+
        //noinspection StringEquality
        if (rawKeywords == lastKeywordPhrasesParsed) {
            Logger.printDebug(() -> "Using previously initialized search");
@ -243,20 +391,33 @@ final class KeywordContentFilter extends Filter {
        String[] split = rawKeywords.split("\n");
        if (split.length != 0) {
            // Linked map so log statements are more organized and easier to read.
-            Set<String> keywords = new LinkedHashSet<>(10 * split.length);
+            // Map is: Phrase -> isWholeWord
+            Map<String, Boolean> keywords = new LinkedHashMap<>(10 * split.length);

            for (String phrase : split) {
-                // Remove any trailing white space the user may have accidentally included.
+                // Remove any trailing spaces the user may have accidentally included.
                phrase = phrase.stripTrailing();
                if (phrase.isBlank()) continue;

-                if (phrase.length() < MINIMUM_KEYWORD_LENGTH) {
+                final boolean wholeWordMatching;
+                if (phraseUsesWholeWordSyntax(phrase)) {
+                    if (phrase.length() == 2) {
+                        continue; // Empty "" phrase
+                    }
+                    phrase = stripWholeWordSyntax(phrase);
+                    wholeWordMatching = true;
+                } else if (phrase.length() < MINIMUM_KEYWORD_LENGTH && !isLanguageWithNoSpaces(phrase)) {
+                    // Allow phrases of 1 and 2 characters if using a
+                    // language that does not use spaces between words.
+
                    // Do not reset the setting. Keep the invalid keywords so the user can fix the mistake.
                    Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_length", phrase, MINIMUM_KEYWORD_LENGTH));
                    continue;
+                } else {
+                    wholeWordMatching = false;
                }

-                // Add common casing that might appear.
+                // Common casing that might appear.
                //
                // This could be simplified by adding case insensitive search to the prefix search,
                // which is very simple to add to StringTrieSearch for Unicode and ByteTrieSearch for ASCII.
@ -265,7 +426,7 @@ final class KeywordContentFilter extends Filter {
                // UTF-8 characters can be different byte lengths, which does
                // not allow comparing two different byte arrays using simple plain array indexes.
                //
-                // Instead add all common case variations of the words.
+                // Instead use all common case variations of the words.
                String[] phraseVariations = {
                        phrase,
                        phrase.toLowerCase(),
@ -273,20 +434,45 @@ final class KeywordContentFilter extends Filter {
                        capitalizeAllFirstLetters(phrase),
                        phrase.toUpperCase()
                };
-                if (phrasesWillHideAllVideos(phraseVariations)) {
-                    Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_common", phrase));
+
+                if (phrasesWillHideAllVideos(phraseVariations, wholeWordMatching)) {
+                    String toastMessage;
+                    // If whole word matching is off, but would pass with on, then show a different toast.
+                    if (!wholeWordMatching && !phrasesWillHideAllVideos(phraseVariations, true)) {
+                        toastMessage = "revanced_hide_keyword_toast_invalid_common_whole_word_required";
+                    } else {
+                        toastMessage = "revanced_hide_keyword_toast_invalid_common";
+                    }
+
+                    Utils.showToastLong(str(toastMessage, phrase));
                    continue;
                }

-                keywords.addAll(Arrays.asList(phraseVariations));
+                for (String variation : phraseVariations) {
+                    // Check if the same phrase is declared both with and without quotes.
+                    Boolean existing = keywords.get(variation);
+                    if (existing == null) {
+                        keywords.put(variation, wholeWordMatching);
+                    } else if (existing != wholeWordMatching) {
+                        Utils.showToastLong(str("revanced_hide_keyword_toast_invalid_conflicting", phrase));
+                        break;
+                    }
+                }
            }

-            for (String keyword : keywords) {
-                // Use a callback to get the keyword that matched.
-                // TrieSearch could have this built in, but that's slightly more complicated since
-                // the strings are stored as a byte array and embedded in the search tree.
+            for (Map.Entry<String, Boolean> entry : keywords.entrySet()) {
+                String keyword = entry.getKey();
+                //noinspection ExtractMethodRecommender
+                final boolean isWholeWord = entry.getValue();
+
                TrieSearch.TriePatternMatchedCallback<byte[]> callback =
-                        (textSearched, matchedStartIndex, matchedLength, callbackParameter) -> {
+                        (textSearched, startIndex, matchLength, callbackParameter) -> {
+                            if (isWholeWord && !keywordMatchIsWholeWord(textSearched, startIndex, matchLength)) {
+                                return false;
+                            }
+
+                            Logger.printDebug(() -> (isWholeWord ? "Matched whole keyword: '"
+                                    : "Matched keyword: '") + keyword + "'");
                            // noinspection unchecked
                            ((MutableReference<String>) callbackParameter).value = keyword;
                            return true;
@ -295,7 +481,7 @@ final class KeywordContentFilter extends Filter {
                search.addPattern(stringBytes, callback);
            }

-            Logger.printDebug(() -> "Search using: (" + search.getEstimatedMemorySize() + " KB) keywords: " + keywords);
+            Logger.printDebug(() -> "Search using: (" + search.getEstimatedMemorySize() + " KB) keywords: " + keywords.keySet());
        }

        bufferSearch = search;
@ -382,7 +568,7 @@ final class KeywordContentFilter extends Filter {
        // Field is intentionally compared using reference equality.
        //noinspection StringEquality
        if (Settings.HIDE_KEYWORD_CONTENT_PHRASES.get() != lastKeywordPhrasesParsed) {
-            // User changed the keywords.
+            // User changed the keywords or whole word setting.
            parseKeywords();
        }

--- a/app/src/main/java/app/revanced/integrations/youtube/settings/preference/HtmlPreference.java
+++ b/app/src/main/java/app/revanced/integrations/youtube/settings/preference/HtmlPreference.java
@ -0,0 +1,35 @@
+package app.revanced.integrations.youtube.settings.preference;
+
+import static android.text.Html.FROM_HTML_MODE_COMPACT;
+
+import android.content.Context;
+import android.os.Build;
+import android.preference.Preference;
+import android.text.Html;
+import android.util.AttributeSet;
+
+import androidx.annotation.RequiresApi;
+
+/**
+ * Allows using basic html for the summary text.
+ * <p>
+ * The summary set by the super constructor is parsed as html and set again.
+ * A preference without a summary is left unchanged.
+ */
+@SuppressWarnings({"unused", "deprecation"})
+@RequiresApi(api = Build.VERSION_CODES.O)
+public class HtmlPreference extends Preference {
+    {
+        // Instance initializer. Runs for every constructor below, after the super constructor.
+        // The summary is null if none was declared, so check before calling toString().
+        final CharSequence summary = getSummary();
+        if (summary != null) {
+            setSummary(Html.fromHtml(summary.toString(), FROM_HTML_MODE_COMPACT));
+        }
+    }
+
+    public HtmlPreference(Context context, AttributeSet attrs, int defStyleAttr, int defStyleRes) {
+        super(context, attrs, defStyleAttr, defStyleRes);
+    }
+    public HtmlPreference(Context context, AttributeSet attrs, int defStyleAttr) {
+        super(context, attrs, defStyleAttr);
+    }
+    public HtmlPreference(Context context, AttributeSet attrs) {
+        super(context, attrs);
+    }
+    public HtmlPreference(Context context) {
+        super(context);
+    }
+}