8 files changed, 456 insertions, 0 deletions
diff --git a/lib/pleroma/web/rich_media/helpers.ex b/lib/pleroma/web/rich_media/helpers.ex
new file mode 100644
index 0000000..0488df3
--- /dev/null
+++ b/lib/pleroma/web/rich_media/helpers.ex
@@ -0,0 +1,130 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Pleroma.Web.RichMedia.Helpers do
+  alias Pleroma.Activity
+  alias Pleroma.Config
+  alias Pleroma.HTML
+  alias Pleroma.Object
+  alias Pleroma.Web.RichMedia.Parser
+
+  @options [
+    pool: :media,
+    max_body: 2_000_000,
+    recv_timeout: 2_000
+  ]
+
+  # Decides whether rich media may be fetched for a page URL. Binaries are
+  # checked with Linkify first and then re-validated as a parsed `URI`; a
+  # `URI` passes only when it is https, has an authority, and neither its
+  # host nor its TLD is listed in the `:rich_media` ignore settings.
+  @spec validate_page_url(URI.t() | binary()) :: :ok | :error
+  defp validate_page_url(page_url) when is_binary(page_url) do
+    opts = [validate_tld: Config.get([Pleroma.Formatter, :validate_tld])]
+    parse_uri(Linkify.Parser.url?(page_url, opts), page_url)
+  end
+
+  defp validate_page_url(%URI{host: host, scheme: "https", authority: authority})
+       when is_binary(authority) do
+    ignored_hosts = Config.get([:rich_media, :ignore_hosts], [])
+    ignored_tlds = Config.get([:rich_media, :ignore_tld], [])
+
+    if host in ignored_hosts or get_tld(host) in ignored_tlds do
+      :error
+    else
+      :ok
+    end
+  end
+
+  # Fallback: any other scheme, a missing authority, or a non-URL term.
+  defp validate_page_url(_), do: :error
+
+  # Second validation stage. The first argument is the result of the URL
+  # check; only `true` lets the URL through to be parsed into a `URI`
+  # struct and validated again.
+  defp parse_uri(true, url), do: validate_page_url(URI.parse(url))
+
+  # Any other check result rejects the URL.
+  defp parse_uri(_, _), do: :error
+
+  # Returns the last dot-separated label of `host` (the whole host when it
+  # contains no dot).
+  defp get_tld(host) do
+    labels = String.split(host, ".")
+    List.last(labels)
+  end
+
+  # Rich media for the first external URL in `object`; `%{}` if any step fails.
+  def fetch_data_for_object(object) do
+    with true <- Config.get([:rich_media, :enabled]),
+         {:ok, url} <- HTML.extract_first_external_url_from_object(object),
+         :ok <- validate_page_url(url),
+         {:ok, parsed} <- Parser.parse(url) do
+      %{page_url: url, rich_media: parsed}
+    else
+      _ -> %{}
+    end
+  end
+
+  def fetch_data_for_activity(%Activity{data: %{"type" => "Create"}} = activity) do
+    with true <- Config.get([:rich_media, :enabled]),
+         %Object{} = normalized <- Object.normalize(activity, fetch: false) do
+      fetch_data_for_object(normalized)
+    else
+      _disabled_or_not_normalized -> %{}
+    end
+  end
+
+  def fetch_data_for_activity(_not_a_create_activity), do: %{}
+
+  # Fetches `url` with a bot user agent. A HEAD request goes first; when it
+  # answers 200, its content type and length are checked before the GET.
+  def rich_media_get(url) do
+    request_headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}]
+
+    precheck =
+      case Pleroma.HTTP.head(url, request_headers, @options) do
+        # The HEAD request failed to reach the server, so the GET that would
+        # follow is assumed to fail as well.
+        {:error, _} = error ->
+          error
+
+        {:ok, %Tesla.Env{status: 200, headers: response_headers}} ->
+          with :ok <- check_content_type(response_headers),
+               do: check_content_length(response_headers)
+
+        _ -> :ok
+      end
+
+    with :ok <- precheck, do: Pleroma.HTTP.get(url, request_headers, @options)
+  end
+
+  # Passes when no content-type header is present or when it parses as
+  # `text/html`; any other value is reported back in the error tuple.
+  defp check_content_type(headers) do
+    with {_name, value} <- List.keyfind(headers, "content-type", 0) do
+      case Plug.Conn.Utils.media_type(value) do
+        {:ok, "text", "html", _params} -> :ok
+        _ -> {:error, {:content_type, value}}
+      end
+    else
+      _ -> :ok
+    end
+  end
+
+  @max_body @options[:max_body]
+
+  # Fails only when the content-length header parses completely as an
+  # integer greater than `@max_body`. A missing header, or a value that is
+  # not a clean integer (for example one with trailing characters), passes
+  # this check.
+  defp check_content_length(headers) do
+    with {_name, raw} <- List.keyfind(headers, "content-length", 0),
+         {declared, ""} <- Integer.parse(raw) do
+      if declared <= @max_body, do: :ok, else: {:error, :body_too_large}
+    else
+      _ -> :ok
+    end
+  end
+end
diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex
new file mode 100644
index 0000000..dbe81ea
--- /dev/null
+++ b/lib/pleroma/web/rich_media/parser.ex
@@ -0,0 +1,169 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Pleroma.Web.RichMedia.Parser do
+  require Logger
+
+  @cachex Pleroma.Config.get([:cachex, :provider], Cachex)
+
+  # Parser modules to try, as configured under `[:rich_media, :parsers]`;
+  # read on every call.
+  defp parsers, do: Pleroma.Config.get([:rich_media, :parsers])
+
+  def parse(nil), do: {:error, "No URL provided"}
+
+  if Pleroma.Config.get(:env) == :test do
+    @spec parse(String.t()) :: {:ok, map()} | {:error, any()}
+    def parse(url), do: parse_url(url)
+  else
+    @spec parse(String.t()) :: {:ok, map()} | {:error, any()}
+    def parse(url) do
+      # Cached or freshly parsed data first, then the image-based TTL update.
+      with {:ok, card} <- get_cached_or_parse(url),
+           {:ok, _ttl} <- set_ttl_based_on_image(card, url),
+           do: {:ok, card}
+    end
+
+    defp get_cached_or_parse(url) do
+      # Errors have to be logged inside the fetch callback rather than next to
+      # the ttl handling below; otherwise every process that was waiting for
+      # the card while it was generated would log the same error. Ideally the
+      # ttl would be set here as well, but a not-yet-created entry cannot be
+      # expired and Cachex cannot return a ttl from the fetch callback.
+      fallback = fn ->
+        case parse_url(url) do
+          {:ok, _} = parsed ->
+            {:commit, parsed}
+
+          {:error, reason} = failure ->
+            log_error(url, reason)
+            {:commit, failure}
+        end
+      end
+
+      case @cachex.fetch(:rich_media_cache, url, fallback) do
+        {action, {:ok, _data} = hit} when action in [:commit, :ok] ->
+          hit
+
+        {:commit, {:error, reason} = failure} ->
+          set_error_ttl(url, reason)
+          failure
+
+        {:ok, {:error, _reason} = failure} ->
+          failure
+
+        {:error, e} ->
+          {:error, {:cachex_error, e}}
+      end
+    end
+
+    # Oversized bodies and unsupported content types are unlikely to change
+    # with time, so no TTL is set for those failures.
+    defp set_error_ttl(_url, :body_too_large), do: :ok
+    defp set_error_ttl(_url, {:content_type, _}), do: :ok
+
+    # Any other failure expires after `:failure_backoff` (60_000 by default).
+    defp set_error_ttl(url, _reason) do
+      backoff = Pleroma.Config.get([:rich_media, :failure_backoff], 60_000)
+      @cachex.expire(:rich_media_cache, url, backoff)
+      :ok
+    end
+
+    # Incomplete or invalid metadata is logged at debug level; every other
+    # failure is logged as a warning.
+    defp log_error(url, {:invalid_metadata, data}),
+      do: Logger.debug(fn -> "Incomplete or invalid metadata for #{url}: #{inspect(data)}" end)
+
+    defp log_error(url, reason),
+      do: Logger.warn(fn -> "Rich media error for #{url}: #{inspect(reason)}" end)
+  end
+
+  @doc """
+  Sets the expiration of the cached rich media entry for `url`.
+
+  The TTL comes from the configured `ttl_setters`, modules adopting the
+  `Pleroma.Web.RichMedia.Parser.TTL` behaviour. A setter result of
+  `{:ok, unix_seconds}` is applied in milliseconds; anything else is a no-op.
+
+  ## Example
+
+      defmodule MyModule do
+        @behaviour Pleroma.Web.RichMedia.Parser.TTL
+        def ttl(data, _url) do
+          # parse the image url and return `{:ok, expiration_as_unix_time}`
+          data |> Map.get("image") |> parse_ttl_from_url()
+        end
+      end
+
+  Define the module in the config
+
+      config :pleroma, :rich_media,
+        ttl_setters: [MyModule]
+  """
+  @spec set_ttl_based_on_image(map(), String.t()) ::
+          {:ok, number() | :noop} | {:error, :no_key}
+  def set_ttl_based_on_image(data, url) do
+    case get_ttl_from_image(data, url) do
+      {:ok, ttl} when is_number(ttl) ->
+        expires_at = ttl * 1000
+
+        case @cachex.expire_at(:rich_media_cache, url, expires_at) do
+          {:ok, true} -> {:ok, expires_at}
+          {:ok, false} -> {:error, :no_key}
+        end
+
+      _ ->
+        {:ok, :noop}
+    end
+  end
+
+  # Runs the configured TTL setters in order. Each `{:ok, _}` result replaces
+  # the previous one; the first other result stops the chain and is returned.
+  defp get_ttl_from_image(data, url) do
+    setters = Pleroma.Config.get([:rich_media, :ttl_setters])
+    Enum.reduce_while(setters, {:ok, nil}, fn setter, _previous ->
+      case setter.ttl(data, url) do
+        {:ok, _ttl} = ok -> {:cont, ok}
+        failure -> {:halt, failure}
+      end
+    end)
+  end
+
+  # Fetches `url`, parses the HTML and extracts its metadata. Fetch and
+  # HTML-parse failures fall through the `with` and are returned unchanged.
+  def parse_url(url) do
+    with {:ok, %Tesla.Env{body: body}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url),
+         {:ok, document} <- Floki.parse_document(body) do
+      metadata = document |> maybe_parse() |> Map.put("url", url)
+
+      metadata |> clean_parsed_data() |> check_parsed_data()
+    end
+  end
+
+  # Tries each configured parser against an empty map and returns the first
+  # result that is not `%{}`; falls back to `%{}` when none produces data.
+  defp maybe_parse(html) do
+    Enum.reduce_while(parsers(), %{}, fn parser, empty ->
+      parsed = parser.parse(html, empty)
+      if parsed != %{}, do: {:halt, parsed}, else: {:cont, empty}
+    end)
+  end
+
+  # Parsed metadata is accepted only when it has a non-empty binary "title";
+  # otherwise it is wrapped in an `:invalid_metadata` error.
+  defp check_parsed_data(data) do
+    case data do
+      %{"title" => title} when is_binary(title) and title != "" -> {:ok, data}
+      _ -> {:error, {:invalid_metadata, data}}
+    end
+  end
+
+  # Drops every entry whose single-pair map `%{key => val}` Jason cannot
+  # encode.
+  defp clean_parsed_data(data) do
+    encodable? = fn {key, val} -> match?({:ok, _}, Jason.encode(%{key => val})) end
+
+    data |> Enum.filter(encodable?) |> Map.new()
+  end
+end
diff --git a/lib/pleroma/web/rich_media/parser/ttl.ex b/lib/pleroma/web/rich_media/parser/ttl.ex
new file mode 100644
index 0000000..59d7f87
--- /dev/null
+++ b/lib/pleroma/web/rich_media/parser/ttl.ex
@@ -0,0 +1,7 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Pleroma.Web.RichMedia.Parser.TTL do
+  @callback ttl(map(), String.t()) :: {:ok, integer() | nil} | {:error, any()}
+end
diff --git a/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex b/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex
new file mode 100644
index 0000000..fa41c16
--- /dev/null
+++ b/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex
@@ -0,0 +1,50 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl do
+  @behaviour Pleroma.Web.RichMedia.Parser.TTL
+
+  # Returns `{:ok, unix_seconds}` (X-Amz-Date plus X-Amz-Expires) when the card
+  # image is an AWS signed URL and `{:error, message}` otherwise. The image is
+  # read from the "image" string key first; `:image` is still accepted.
+  @impl true
+  def ttl(data, _url) do
+    image = Map.get(data, "image") || Map.get(data, :image)
+    case aws_signed_query(image) do
+      nil -> {:error, "Not aws signed url #{inspect(image)}"}
+      query -> query |> format_query_params() |> get_expiration_timestamp()
+    end
+  end
+
+  # Query string of an AWS signed URL, or nil (host/query may be nil after parsing).
+  defp aws_signed_query(image) do
+    with true <- is_binary(image),
+         %URI{host: host, query: query} when is_binary(host) and is_binary(query) <-
+           URI.parse(image),
+         true <- String.contains?(host, "amazonaws.com"),
+         true <- String.contains?(query, "X-Amz-Expires") do
+      query
+    else
+      _ -> nil
+    end
+  end
+
+  # Splits each pair on its first "=" only; a key without a value maps to "".
+  defp format_query_params(query) do
+    for pair <- String.split(query, "&", trim: true), into: %{} do
+      [key | rest] = String.split(pair, "=", parts: 2)
+      {key, List.first(rest) || ""}
+    end
+  end
+
+  # A missing or malformed date or lifetime yields an error tuple, not a raise.
+  defp get_expiration_timestamp(params) do
+    with {:ok, date} <- Timex.parse(Map.get(params, "X-Amz-Date", ""), "{ISO:Basic:Z}"),
+         {expires, ""} <- Integer.parse(Map.get(params, "X-Amz-Expires", "")) do
+      {:ok, Timex.to_unix(date) + expires}
+    else
+      _ -> {:error, "Invalid aws signed url parameters"}
+    end
+  end
+end
diff --git a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex
new file mode 100644
index 0000000..320a5f5
--- /dev/null
+++ b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex
@@ -0,0 +1,46 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Pleroma.Web.RichMedia.Parsers.MetaTagsParser do
+  # Merges every `<meta>` tag whose `key_name` attribute starts with
+  # "<prefix>:" into `data`, then lets `maybe_put_title/2` fill in a title.
+  def parse(data, html, prefix, key_name, value_name \\ "content") do
+    tags = get_elements(html, key_name, prefix)
+
+    tags
+    |> Enum.map(&normalize_attributes(&1, prefix, key_name, value_name))
+    |> Enum.reduce(data, &Map.merge(&2, &1))
+    |> maybe_put_title(html)
+  end
+
+  # Selects `<meta>` nodes whose `key_name` attribute begins with "<prefix>:".
+  defp get_elements(html, key_name, prefix),
+    do: Floki.find(html, "meta[#{key_name}^='#{prefix}:']")
+
+  # Turns one `<meta>` node into `%{key => value}`. The "<prefix>:" marker is
+  # removed once, and only from the key attribute, so a value that merely
+  # starts with the same text (e.g. content="og:image explained") is kept
+  # intact.
+  defp normalize_attributes({_tag, attributes, _children}, prefix, key_name, value_name) do
+    attrs = Map.new(attributes)
+    key = String.replace_prefix(attrs[key_name], "#{prefix}:", "")
+
+    %{key => attrs[value_name]}
+  end
+
+  # Non-empty metadata without a "title" borrows the page's `<title>` text,
+  # unless that text is empty. Everything else is returned unchanged.
+  defp maybe_put_title(meta, html) do
+    with false <- meta == %{} or Map.has_key?(meta, "title"),
+         title when title != "" <- get_page_title(html) do
+      Map.put_new(meta, "title", title)
+    else
+      _ -> meta
+    end
+  end
+
+  # Text of the first node matching `html head title`.
+  defp get_page_title(html),
+    do: html |> Floki.find("html head title") |> List.first() |> Floki.text()
+end
diff --git a/lib/pleroma/web/rich_media/parsers/o_embed.ex b/lib/pleroma/web/rich_media/parsers/o_embed.ex
new file mode 100644
index 0000000..0f30317
--- /dev/null
+++ b/lib/pleroma/web/rich_media/parsers/o_embed.ex
@@ -0,0 +1,29 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Pleroma.Web.RichMedia.Parsers.OEmbed do
+  # Follows the page's JSON oEmbed discovery link and returns the decoded
+  # document with its "html" field filtered, or `%{}` when any step fails.
+  def parse(html, _data) do
+    with [_ | _] = links <- get_discovery_data(html),
+         href when is_binary(href) <- get_oembed_url(links),
+         {:ok, %{"html" => embed_html} = oembed} <- get_oembed_data(href) do
+      %{oembed | "html" => Pleroma.HTML.filter_tags(embed_html)}
+    else
+      _ -> %{}
+    end
+  end
+
+  defp get_discovery_data(html), do: Floki.find(html, "link[type='application/json+oembed']")
+
+  # Only the first discovery link is consulted.
+  defp get_oembed_url([{"link", attributes, _children} | _]) do
+    Enum.find_value(attributes, fn {name, value} -> if name == "href", do: value end)
+  end
+
+  # NOTE(review): rich_media_get/1 rejects non-text/html HEAD replies; verify JSON passes.
+  defp get_oembed_data(url) do
+    with {:ok, %Tesla.Env{body: json}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url),
+         do: Jason.decode(json)
+  end
+end
diff --git a/lib/pleroma/web/rich_media/parsers/ogp.ex b/lib/pleroma/web/rich_media/parsers/ogp.ex
new file mode 100644
index 0000000..b7f2b42
--- /dev/null
+++ b/lib/pleroma/web/rich_media/parsers/ogp.ex
@@ -0,0 +1,10 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Pleroma.Web.RichMedia.Parsers.OGP do
+  # No-op parser: it ignores its arguments and always returns `%{}`.
+  # Presumably kept so configurations that still list it keep working.
+  @deprecated "OGP parser is deprecated. Use TwitterCard instead."
+  def parse(_html, _data), do: %{}
+end
diff --git a/lib/pleroma/web/rich_media/parsers/twitter_card.ex b/lib/pleroma/web/rich_media/parsers/twitter_card.ex
new file mode 100644
index 0000000..cc65372
--- /dev/null
+++ b/lib/pleroma/web/rich_media/parsers/twitter_card.ex
@@ -0,0 +1,15 @@
+# Pleroma: A lightweight social networking server
+# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>
+# SPDX-License-Identifier: AGPL-3.0-only
+
+defmodule Pleroma.Web.RichMedia.Parsers.TwitterCard do
+  alias Pleroma.Web.RichMedia.Parsers.MetaTagsParser
+
+  # Open Graph tags are merged first; Twitter tags merged later override them.
+  @spec parse(list(), map()) :: map()
+  def parse(html, data) do
+    og = MetaTagsParser.parse(data, html, "og", "property")
+    named = MetaTagsParser.parse(og, html, "twitter", "name")
+    MetaTagsParser.parse(named, html, "twitter", "property")
+  end
+end