diff options
Diffstat (limited to 'lib/pleroma/web/rich_media')
| -rw-r--r-- | lib/pleroma/web/rich_media/helpers.ex | 130 | ||||
| -rw-r--r-- | lib/pleroma/web/rich_media/parser.ex | 169 | ||||
| -rw-r--r-- | lib/pleroma/web/rich_media/parser/ttl.ex | 7 | ||||
| -rw-r--r-- | lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex | 50 | ||||
| -rw-r--r-- | lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex | 46 | ||||
| -rw-r--r-- | lib/pleroma/web/rich_media/parsers/o_embed.ex | 29 | ||||
| -rw-r--r-- | lib/pleroma/web/rich_media/parsers/ogp.ex | 10 | ||||
| -rw-r--r-- | lib/pleroma/web/rich_media/parsers/twitter_card.ex | 15 |
8 files changed, 456 insertions, 0 deletions
diff --git a/lib/pleroma/web/rich_media/helpers.ex b/lib/pleroma/web/rich_media/helpers.ex new file mode 100644 index 0000000..0488df3 --- /dev/null +++ b/lib/pleroma/web/rich_media/helpers.ex @@ -0,0 +1,130 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/> +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.Web.RichMedia.Helpers do + alias Pleroma.Activity + alias Pleroma.Config + alias Pleroma.HTML + alias Pleroma.Object + alias Pleroma.Web.RichMedia.Parser + + @options [ + pool: :media, + max_body: 2_000_000, + recv_timeout: 2_000 + ] + + @spec validate_page_url(URI.t() | binary()) :: :ok | :error + defp validate_page_url(page_url) when is_binary(page_url) do + validate_tld = Config.get([Pleroma.Formatter, :validate_tld]) + + page_url + |> Linkify.Parser.url?(validate_tld: validate_tld) + |> parse_uri(page_url) + end + + defp validate_page_url(%URI{host: host, scheme: "https", authority: authority}) + when is_binary(authority) do + cond do + host in Config.get([:rich_media, :ignore_hosts], []) -> + :error + + get_tld(host) in Config.get([:rich_media, :ignore_tld], []) -> + :error + + true -> + :ok + end + end + + defp validate_page_url(_), do: :error + + defp parse_uri(true, url) do + url + |> URI.parse() + |> validate_page_url + end + + defp parse_uri(_, _), do: :error + + defp get_tld(host) do + host + |> String.split(".") + |> Enum.reverse() + |> hd + end + + def fetch_data_for_object(object) do + with true <- Config.get([:rich_media, :enabled]), + {:ok, page_url} <- + HTML.extract_first_external_url_from_object(object), + :ok <- validate_page_url(page_url), + {:ok, rich_media} <- Parser.parse(page_url) do + %{page_url: page_url, rich_media: rich_media} + else + _ -> %{} + end + end + + def fetch_data_for_activity(%Activity{data: %{"type" => "Create"}} = activity) do + with true <- Config.get([:rich_media, :enabled]), + %Object{} = object <- Object.normalize(activity, fetch: false) do + fetch_data_for_object(object) + else + _ -> %{} + end + end + + def fetch_data_for_activity(_), do: %{} + + def rich_media_get(url) do + headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}] + + head_check = + case Pleroma.HTTP.head(url, headers, @options) do + # If the HEAD request didn't reach the server for whatever reason, + # we assume the GET that comes right after won't either + {:error, _} = e -> + e + + {:ok, %Tesla.Env{status: 200, headers: headers}} -> + with :ok <- check_content_type(headers), + :ok <- check_content_length(headers), + do: :ok + + _ -> + :ok + end + + with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, @options) + end + + defp check_content_type(headers) do + case List.keyfind(headers, "content-type", 0) do + {_, content_type} -> + case Plug.Conn.Utils.media_type(content_type) do + {:ok, "text", "html", _} -> :ok + _ -> {:error, {:content_type, content_type}} + end + + _ -> + :ok + end + end + + @max_body @options[:max_body] + defp check_content_length(headers) do + case List.keyfind(headers, "content-length", 0) do + {_, maybe_content_length} -> + case Integer.parse(maybe_content_length) do + {content_length, ""} when content_length <= @max_body -> :ok + {_, ""} -> {:error, :body_too_large} + _ -> :ok + end + + _ -> + :ok + end + end +end diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex new file mode 100644 index 0000000..dbe81ea --- /dev/null +++ b/lib/pleroma/web/rich_media/parser.ex @@ -0,0 +1,169 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/> +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.Web.RichMedia.Parser do + require Logger + + @cachex Pleroma.Config.get([:cachex, :provider], Cachex) + + defp parsers do + Pleroma.Config.get([:rich_media, :parsers]) + end + + def parse(nil), do: {:error, "No URL provided"} + + if Pleroma.Config.get(:env) == :test do + @spec parse(String.t()) :: {:ok, map()} | {:error, any()} + def parse(url), do: parse_url(url) + else + @spec parse(String.t()) :: {:ok, map()} | {:error, any()} + def parse(url) do + with {:ok, data} <- get_cached_or_parse(url), + {:ok, _} <- set_ttl_based_on_image(data, url) do + {:ok, data} + end + end + + defp get_cached_or_parse(url) do + case @cachex.fetch(:rich_media_cache, url, fn -> + case parse_url(url) do + {:ok, _} = res -> + {:commit, res} + + {:error, reason} = e -> + # Unfortunately we have to log errors here, instead of doing that + # along with ttl setting at the bottom. Otherwise we can get log spam + # if more than one process was waiting for the rich media card + # while it was generated. Ideally we would set ttl here as well, + # so we don't override it number_of_waiters_on_generation + # times, but one, obviously, can't set ttl for not-yet-created entry + # and Cachex doesn't support returning ttl from the fetch callback. + log_error(url, reason) + {:commit, e} + end + end) do + {action, res} when action in [:commit, :ok] -> + case res do + {:ok, _data} = res -> + res + + {:error, reason} = e -> + if action == :commit, do: set_error_ttl(url, reason) + e + end + + {:error, e} -> + {:error, {:cachex_error, e}} + end + end + + defp set_error_ttl(_url, :body_too_large), do: :ok + defp set_error_ttl(_url, {:content_type, _}), do: :ok + + # The TTL is not set for the errors above, since they are unlikely to change + # with time + + defp set_error_ttl(url, _reason) do + ttl = Pleroma.Config.get([:rich_media, :failure_backoff], 60_000) + @cachex.expire(:rich_media_cache, url, ttl) + :ok + end + + defp log_error(url, {:invalid_metadata, data}) do + Logger.debug(fn -> "Incomplete or invalid metadata for #{url}: #{inspect(data)}" end) + end + + defp log_error(url, reason) do + Logger.warn(fn -> "Rich media error for #{url}: #{inspect(reason)}" end) + end + end + + @doc """ + Set the rich media cache based on the expiration time of image. + + Adopt behaviour `Pleroma.Web.RichMedia.Parser.TTL` + + ## Example + + defmodule MyModule do + @behaviour Pleroma.Web.RichMedia.Parser.TTL + def ttl(data, url) do + image_url = Map.get(data, :image) + # do some parsing in the url and get the ttl of the image + # and return ttl is unix time + parse_ttl_from_url(image_url) + end + end + + Define the module in the config + + config :pleroma, :rich_media, + ttl_setters: [MyModule] + """ + @spec set_ttl_based_on_image(map(), String.t()) :: + {:ok, Integer.t() | :noop} | {:error, :no_key} + def set_ttl_based_on_image(data, url) do + case get_ttl_from_image(data, url) do + {:ok, ttl} when is_number(ttl) -> + ttl = ttl * 1000 + + case @cachex.expire_at(:rich_media_cache, url, ttl) do + {:ok, true} -> {:ok, ttl} + {:ok, false} -> {:error, :no_key} + end + + _ -> + {:ok, :noop} + end + end + + defp get_ttl_from_image(data, url) do + [:rich_media, :ttl_setters] + |> Pleroma.Config.get() + |> Enum.reduce({:ok, nil}, fn + module, {:ok, _ttl} -> + module.ttl(data, url) + + _, error -> + error + end) + end + + def parse_url(url) do + with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url), + {:ok, html} <- Floki.parse_document(html) do + html + |> maybe_parse() + |> Map.put("url", url) + |> clean_parsed_data() + |> check_parsed_data() + end + end + + defp maybe_parse(html) do + Enum.reduce_while(parsers(), %{}, fn parser, acc -> + case parser.parse(html, acc) do + data when data != %{} -> {:halt, data} + _ -> {:cont, acc} + end + end) + end + + defp check_parsed_data(%{"title" => title} = data) + when is_binary(title) and title != "" do + {:ok, data} + end + + defp check_parsed_data(data) do + {:error, {:invalid_metadata, data}} + end + + defp clean_parsed_data(data) do + data + |> Enum.reject(fn {key, val} -> + not match?({:ok, _}, Jason.encode(%{key => val})) + end) + |> Map.new() + end +end diff --git a/lib/pleroma/web/rich_media/parser/ttl.ex b/lib/pleroma/web/rich_media/parser/ttl.ex new file mode 100644 index 0000000..59d7f87 --- /dev/null +++ b/lib/pleroma/web/rich_media/parser/ttl.ex @@ -0,0 +1,7 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/> +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.Web.RichMedia.Parser.TTL do + @callback ttl(Map.t(), String.t()) :: Integer.t() | nil +end diff --git a/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex b/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex new file mode 100644 index 0000000..fa41c16 --- /dev/null +++ b/lib/pleroma/web/rich_media/parser/ttl/aws_signed_url.ex @@ -0,0 +1,50 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/> +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl do + @behaviour Pleroma.Web.RichMedia.Parser.TTL + + @impl true + def ttl(data, _url) do + image = Map.get(data, :image) + + if is_aws_signed_url(image) do + image + |> parse_query_params() + |> format_query_params() + |> get_expiration_timestamp() + else + {:error, "Not aws signed url #{inspect(image)}"} + end + end + + defp is_aws_signed_url(image) when is_binary(image) and image != "" do + %URI{host: host, query: query} = URI.parse(image) + + String.contains?(host, "amazonaws.com") and String.contains?(query, "X-Amz-Expires") + end + + defp is_aws_signed_url(_), do: nil + + defp parse_query_params(image) do + %URI{query: query} = URI.parse(image) + query + end + + defp format_query_params(query) do + query + |> String.split(~r/&|=/) + |> Enum.chunk_every(2) + |> Map.new(fn [k, v] -> {k, v} end) + end + + defp get_expiration_timestamp(params) when is_map(params) do + {:ok, date} = + params + |> Map.get("X-Amz-Date") + |> Timex.parse("{ISO:Basic:Z}") + + {:ok, Timex.to_unix(date) + String.to_integer(Map.get(params, "X-Amz-Expires"))} + end +end diff --git a/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex new file mode 100644 index 0000000..320a5f5 --- /dev/null +++ b/lib/pleroma/web/rich_media/parsers/meta_tags_parser.ex @@ -0,0 +1,46 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/> +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.Web.RichMedia.Parsers.MetaTagsParser do + def parse(data, html, prefix, key_name, value_name \\ "content") do + html + |> get_elements(key_name, prefix) + |> Enum.reduce(data, fn el, acc -> + attributes = normalize_attributes(el, prefix, key_name, value_name) + + Map.merge(acc, attributes) + end) + |> maybe_put_title(html) + end + + defp get_elements(html, key_name, prefix) do + html |> Floki.find("meta[#{key_name}^='#{prefix}:']") + end + + defp normalize_attributes(html_node, prefix, key_name, value_name) do + {_tag, attributes, _children} = html_node + + data = + Map.new(attributes, fn {name, value} -> + {name, String.trim_leading(value, "#{prefix}:")} + end) + + %{data[key_name] => data[value_name]} + end + + defp maybe_put_title(%{"title" => _} = meta, _), do: meta + + defp maybe_put_title(meta, html) when meta != %{} do + case get_page_title(html) do + "" -> meta + title -> Map.put_new(meta, "title", title) + end + end + + defp maybe_put_title(meta, _), do: meta + + defp get_page_title(html) do + Floki.find(html, "html head title") |> List.first() |> Floki.text() + end +end diff --git a/lib/pleroma/web/rich_media/parsers/o_embed.ex b/lib/pleroma/web/rich_media/parsers/o_embed.ex new file mode 100644 index 0000000..0f30317 --- /dev/null +++ b/lib/pleroma/web/rich_media/parsers/o_embed.ex @@ -0,0 +1,29 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/> +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.Web.RichMedia.Parsers.OEmbed do + def parse(html, _data) do + with elements = [_ | _] <- get_discovery_data(html), + oembed_url when is_binary(oembed_url) <- get_oembed_url(elements), + {:ok, oembed_data = %{"html" => html}} <- get_oembed_data(oembed_url) do + %{oembed_data | "html" => Pleroma.HTML.filter_tags(html)} + else + _e -> %{} + end + end + + defp get_discovery_data(html) do + html |> Floki.find("link[type='application/json+oembed']") + end + + defp get_oembed_url([{"link", attributes, _children} | _]) do + Enum.find_value(attributes, fn {k, v} -> if k == "href", do: v end) + end + + defp get_oembed_data(url) do + with {:ok, %Tesla.Env{body: json}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url) do + Jason.decode(json) + end + end +end diff --git a/lib/pleroma/web/rich_media/parsers/ogp.ex b/lib/pleroma/web/rich_media/parsers/ogp.ex new file mode 100644 index 0000000..b7f2b42 --- /dev/null +++ b/lib/pleroma/web/rich_media/parsers/ogp.ex @@ -0,0 +1,10 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/> +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.Web.RichMedia.Parsers.OGP do + @deprecated "OGP parser is deprecated. Use TwitterCard instead." + def parse(_html, _data) do + %{} + end +end diff --git a/lib/pleroma/web/rich_media/parsers/twitter_card.ex b/lib/pleroma/web/rich_media/parsers/twitter_card.ex new file mode 100644 index 0000000..cc65372 --- /dev/null +++ b/lib/pleroma/web/rich_media/parsers/twitter_card.ex @@ -0,0 +1,15 @@ +# Pleroma: A lightweight social networking server +# Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/> +# SPDX-License-Identifier: AGPL-3.0-only + +defmodule Pleroma.Web.RichMedia.Parsers.TwitterCard do + alias Pleroma.Web.RichMedia.Parsers.MetaTagsParser + + @spec parse(list(), map()) :: map() + def parse(html, data) do + data + |> MetaTagsParser.parse(html, "og", "property") + |> MetaTagsParser.parse(html, "twitter", "name") + |> MetaTagsParser.parse(html, "twitter", "property") + end +end |
