Source code for granary.reddit

# coding=utf-8
"""Reddit source class.

Reddit API docs:
https://github.com/reddit-archive/reddit/wiki/API
https://www.reddit.com/dev/api
https://www.reddit.com/prefs/apps

PRAW API docs:
https://praw.readthedocs.io/
"""
import urllib.parse, urllib.request
import threading

from cachetools import cachedmethod, TTLCache
from oauth_dropins import reddit
from oauth_dropins.webutil import util
import praw
from prawcore.exceptions import NotFound

from . import source

USER_CACHE_TIME = 5 * 60  # 5 minute expiration, in seconds
user_cache = TTLCache(1000, USER_CACHE_TIME)
user_cache_lock = threading.RLock()


[docs]class Reddit(source.Source):
  """Reddit source class. See file docstring and Source class for details."""

  DOMAIN = 'reddit.com'
  BASE_URL = 'https://reddit.com'
  NAME = 'Reddit'
  OPTIMIZED_COMMENTS = True

[docs]  def __init__(self, refresh_token):
    self.refresh_token = refresh_token
    self.reddit_api = None

  def get_reddit_api(self):
    if not self.reddit_api:
      self.reddit_api = praw.Reddit(client_id=reddit.REDDIT_APP_KEY,
                                    client_secret=reddit.REDDIT_APP_SECRET,
                                    refresh_token=self.refresh_token,
                                    user_agent='granary (https://granary.io/)')
      self.reddit_api.read_only = True

    return self.reddit_api

[docs]  @classmethod
  def post_id(self, url):
    """Guesses the post id of the given URL.

    Args:
      url: string

    Returns:
      string, or None
    """
    path_parts = urllib.parse.urlparse(url).path.rstrip('/').split('/')
    if len(path_parts) >= 2:
      return path_parts[-2]

[docs]  @cachedmethod(lambda self: user_cache, lock=lambda self: user_cache_lock,
                key=lambda user: getattr(user, 'name', None))
  def praw_to_actor(self, praw_user):
    """Converts a PRAW Redditor to an actor.

    Makes external calls to fetch data from the Reddit API.

    https://praw.readthedocs.io/en/latest/code_overview/models/redditor.html

    Caches fetched user data for 5m to avoid repeating user profile API requests
    when fetching multiple comments or posts from the same author. Background:
    https://github.com/snarfed/bridgy/issues/1021

    Ideally this would be part of PRAW, but they seem uninterested:
    https://github.com/praw-dev/praw/issues/131
    https://github.com/praw-dev/praw/issues/1140

    Args:
      user: PRAW Redditor object

    Returns:
      an ActivityStreams actor dict, ready to be JSON-encoded
    """
    try:
      user = reddit.praw_to_user(praw_user)
    except NotFound:
      return {}

    return self.user_to_actor(user)

[docs]  def user_to_actor(self, user):
    """Converts a dict user to an actor.

    Args:
      user: JSON user

    Returns:
      an ActivityStreams actor dict, ready to be JSON-encoded
    """
    username = user.get('name')
    if not username:
      return {}

    # trying my best to grab all the urls from the profile description
    urls = [f'{self.BASE_URL}/user/{username}/']
    description = None

    subreddit = user.get('subreddit')
    if subreddit:
      url = subreddit.get('url')
      if url:
        urls.append(self.BASE_URL + url)
      description = subreddit.get('description')
      urls += util.trim_nulls(util.extract_links(description))

    image = user.get('icon_img')

    return util.trim_nulls({
      'objectType': 'person',
      'displayName': username,
      'image': {'url': image},
      'id': self.tag_uri(username),
      # numeric_id is our own custom field that always has the source's numeric
      # user id, if available.
      'numeric_id': user.get('id'),
      'published': util.maybe_timestamp_to_iso8601(user.get('created_utc')),
      'url': urls[0],
      'urls': [{'value': u} for u in urls] if len(urls) > 1 else None,
      'username': username,
      'description': description,
    })

[docs]  def praw_to_object(self, thing, type):
    """Converts a PRAW object to an AS1 object.

    Currently only returns public content.

    Note that this will make external API calls to lazily load some attributes.

    Args:
      thing: a PRAW object, Submission or Comment
      type: string to denote whether to get submission or comment content

    Returns:
      an ActivityStreams object dict, ready to be JSON-encoded
      """
    id = getattr(thing, 'id', None)
    if not id:
      return {}

    published = util.maybe_timestamp_to_iso8601(getattr(thing, 'created_utc', None))
    obj = {
      'id': self.tag_uri(id),
      'url': self.BASE_URL + thing.permalink,
      'published': published,
      'to': [{
        'objectType': 'group',
        'alias': '@public',
      }],
    }

    user = getattr(thing, 'author', None)
    if user:
      obj['author'] = self.praw_to_actor(user)

    if type == 'submission':
      content = getattr(thing, 'selftext', None)
      obj.update({
        'displayName': getattr(thing, 'title', None),
        'content': content,
        'objectType': 'note',
        'tags': [{
          'objectType': 'article',
          'url': t,
          'displayName': t,
        } for t in util.extract_links(content)],
      })

      url = getattr(thing, 'url', None)
      if url:
        obj.update({
          'objectType': 'bookmark',
          'targetUrl': url,
        })

    elif type == 'comment':
      obj.update({
        'content': getattr(thing, 'body_html', None),
        'objectType': 'comment',
      })
      reply_to = thing.parent()
      if reply_to:
        obj['inReplyTo'] = [{
          'id': self.tag_uri(getattr(reply_to, 'id', None)),
          'url': self.BASE_URL + getattr(reply_to, 'permalink', None),
        }]

    return self.postprocess_object(obj)

[docs]  def praw_to_activity(self, thing, type):
    """Converts a PRAW submission or comment to an activity.

    Note that this will make external API calls to lazily load some attributes.

    https://praw.readthedocs.io/en/latest/code_overview/models/submission.html
    https://praw.readthedocs.io/en/latest/code_overview/models/comment.html

    Args:
      thing: a PRAW object, Submission or Comment
      type: string to denote whether to get submission or comment content

    Returns:
      an ActivityStreams activity dict, ready to be JSON-encoded
    """
    obj = self.praw_to_object(thing, type)
    if not obj:
      return {}

    activity = {
      'verb': 'post',
      'id': obj['id'],
      'url': self.BASE_URL + getattr(thing, 'permalink', None),
      'actor': obj.get('author'),
      'object': obj,
    }
    return self.postprocess_activity(activity)

  def _fetch_replies(self, r, activities):
    """Fetches and injects comments into a list of activities, in place.

    limitations: Only includes top level comments
    Args:
      r: PRAW API object for querying submissions in activities
      activities: list of activity dicts
    """
    for activity in activities:
      subm = r.submission(id=util.parse_tag_uri(activity.get('id'))[1])

      # for v0 we will use just the top level comments because threading is hard.
      # feature request: https://github.com/snarfed/bridgy/issues/1014
      subm.comments.replace_more()
      replies = [self.praw_to_activity(top_level_comment, 'comment')
                 for top_level_comment in subm.comments]
      items = [r.get('object') for r in replies]
      activity['object']['replies'] = {
        'items': items,
        'totalItems': len(items),
      }

[docs]  def get_activities_response(self, user_id=None, group_id=None, app_id=None,
                              activity_id=None, start_index=0, count=0,
                              etag=None, min_id=None, cache=None,
                              fetch_replies=False, fetch_likes=False,
                              fetch_shares=False, fetch_events=False,
                              fetch_mentions=False, search_query=None, **kwargs):
    """Fetches submissions and ActivityStreams activities.

    Currently only implements activity_id, search_query and fetch_replies.
    """
    activities = []
    r = self.get_reddit_api()

    if activity_id:
      subm = r.submission(id=activity_id)
      activities.append(self.praw_to_activity(subm, 'submission'))
    elif search_query:
      sr = r.subreddit('all')
      subms = sr.search(search_query, sort='new')
      activities.extend([self.praw_to_activity(subm, 'submission') for subm in subms])

    if fetch_replies:
      self._fetch_replies(r, activities)

    return self.make_activities_base_response(activities)

[docs]  def get_actor(self, user_id=None):
    """PLACEHOLDER. Returns an empty dict.

    Only here because the granary.io API needs this to emit Atom data.

    TODO: implement.
    """
    return {}

[docs]  def get_comment(self, comment_id, activity_id=None, activity_author_id=None,
                  activity=None):
    """Returns an ActivityStreams comment object.

    Args:
      comment_id: string comment id
      activity_id: string activity id, Ignored
      activity_author_id: string activity author id. Ignored.
      activity: activity object, Ignored
    """
    r = self.get_reddit_api()
    return self.praw_to_object(r.comment(id=comment_id), 'comment')

[docs]  def user_url(self, username):
    """Returns the Reddit URL for a given user."""
    return 'https://%s/user/%s' % (self.DOMAIN, username)