You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

223 line
5.6 KiB

  1. /* C implementation of performance sensitive functions. */
  2. #define PY_SSIZE_T_CLEAN
  3. #include <Python.h>
  4. #include <stdint.h> /* uint8_t, uint32_t, uint64_t */
  5. #if __ARM_NEON
  6. #include <arm_neon.h>
  7. #elif __SSE2__
  8. #include <emmintrin.h>
  9. #endif
  10. static const Py_ssize_t MASK_LEN = 4; /* Required length of the mask argument, in bytes. */
  11. /* Similar to PyBytes_AsStringAndSize, but accepts more types */
  12. static int
  13. _PyBytesLike_AsStringAndSize(PyObject *obj, PyObject **tmp, char **buffer, Py_ssize_t *length)
  14. {
  15. // This supports bytes, bytearrays, and memoryview objects,
  16. // which are common data structures for handling byte streams.
  17. // If *tmp isn't NULL, the caller gets a new reference.
  18. if (PyBytes_Check(obj))
  19. {
  20. *tmp = NULL;
  21. *buffer = PyBytes_AS_STRING(obj);
  22. *length = PyBytes_GET_SIZE(obj);
  23. }
  24. else if (PyByteArray_Check(obj))
  25. {
  26. *tmp = NULL;
  27. *buffer = PyByteArray_AS_STRING(obj);
  28. *length = PyByteArray_GET_SIZE(obj);
  29. }
  30. else if (PyMemoryView_Check(obj))
  31. {
  32. *tmp = PyMemoryView_GetContiguous(obj, PyBUF_READ, 'C');
  33. if (*tmp == NULL)
  34. {
  35. return -1;
  36. }
  37. Py_buffer *mv_buf;
  38. mv_buf = PyMemoryView_GET_BUFFER(*tmp);
  39. *buffer = mv_buf->buf;
  40. *length = mv_buf->len;
  41. }
  42. else
  43. {
  44. PyErr_Format(
  45. PyExc_TypeError,
  46. "expected a bytes-like object, %.200s found",
  47. Py_TYPE(obj)->tp_name);
  48. return -1;
  49. }
  50. return 0;
  51. }
  52. /* C implementation of websockets.utils.apply_mask */
  53. static PyObject *
  54. apply_mask(PyObject *self, PyObject *args, PyObject *kwds)
  55. {
  56. // In order to support various bytes-like types, accept any Python object.
  57. static char *kwlist[] = {"data", "mask", NULL};
  58. PyObject *input_obj;
  59. PyObject *mask_obj;
  60. // A pointer to a char * + length will be extracted from the data and mask
  61. // arguments, possibly via a Py_buffer.
  62. PyObject *input_tmp = NULL;
  63. char *input;
  64. Py_ssize_t input_len;
  65. PyObject *mask_tmp = NULL;
  66. char *mask;
  67. Py_ssize_t mask_len;
  68. // Initialize a PyBytesObject then get a pointer to the underlying char *
  69. // in order to avoid an extra memory copy in PyBytes_FromStringAndSize.
  70. PyObject *result = NULL;
  71. char *output;
  72. // Other variables.
  73. Py_ssize_t i = 0;
  74. // Parse inputs.
  75. if (!PyArg_ParseTupleAndKeywords(
  76. args, kwds, "OO", kwlist, &input_obj, &mask_obj))
  77. {
  78. goto exit;
  79. }
  80. if (_PyBytesLike_AsStringAndSize(input_obj, &input_tmp, &input, &input_len) == -1)
  81. {
  82. goto exit;
  83. }
  84. if (_PyBytesLike_AsStringAndSize(mask_obj, &mask_tmp, &mask, &mask_len) == -1)
  85. {
  86. goto exit;
  87. }
  88. if (mask_len != MASK_LEN)
  89. {
  90. PyErr_SetString(PyExc_ValueError, "mask must contain 4 bytes");
  91. goto exit;
  92. }
  93. // Create output.
  94. result = PyBytes_FromStringAndSize(NULL, input_len);
  95. if (result == NULL)
  96. {
  97. goto exit;
  98. }
  99. // Since we just created result, we don't need error checks.
  100. output = PyBytes_AS_STRING(result);
  101. // Perform the masking operation.
  102. // Apparently GCC cannot figure out the following optimizations by itself.
  103. // We need a new scope for MSVC 2010 (non C99 friendly)
  104. {
  105. #if __ARM_NEON
  106. // With NEON support, XOR by blocks of 16 bytes = 128 bits.
  107. Py_ssize_t input_len_128 = input_len & ~15;
  108. uint8x16_t mask_128 = vreinterpretq_u8_u32(vdupq_n_u32(*(uint32_t *)mask));
  109. for (; i < input_len_128; i += 16)
  110. {
  111. uint8x16_t in_128 = vld1q_u8((uint8_t *)(input + i));
  112. uint8x16_t out_128 = veorq_u8(in_128, mask_128);
  113. vst1q_u8((uint8_t *)(output + i), out_128);
  114. }
  115. #elif __SSE2__
  116. // With SSE2 support, XOR by blocks of 16 bytes = 128 bits.
  117. // Since we cannot control the 16-bytes alignment of input and output
  118. // buffers, we rely on loadu/storeu rather than load/store.
  119. Py_ssize_t input_len_128 = input_len & ~15;
  120. __m128i mask_128 = _mm_set1_epi32(*(uint32_t *)mask);
  121. for (; i < input_len_128; i += 16)
  122. {
  123. __m128i in_128 = _mm_loadu_si128((__m128i *)(input + i));
  124. __m128i out_128 = _mm_xor_si128(in_128, mask_128);
  125. _mm_storeu_si128((__m128i *)(output + i), out_128);
  126. }
  127. #else
  128. // Without SSE2 support, XOR by blocks of 8 bytes = 64 bits.
  129. // We assume the memory allocator aligns everything on 8 bytes boundaries.
  130. Py_ssize_t input_len_64 = input_len & ~7;
  131. uint32_t mask_32 = *(uint32_t *)mask;
  132. uint64_t mask_64 = ((uint64_t)mask_32 << 32) | (uint64_t)mask_32;
  133. for (; i < input_len_64; i += 8)
  134. {
  135. *(uint64_t *)(output + i) = *(uint64_t *)(input + i) ^ mask_64;
  136. }
  137. #endif
  138. }
  139. // XOR the remainder of the input byte by byte.
  140. for (; i < input_len; i++)
  141. {
  142. output[i] = input[i] ^ mask[i & (MASK_LEN - 1)];
  143. }
  144. exit:
  145. Py_XDECREF(input_tmp);
  146. Py_XDECREF(mask_tmp);
  147. return result;
  148. }
  149. static PyMethodDef speedups_methods[] = {
  150. {
  151. "apply_mask",
  152. (PyCFunction)apply_mask,
  153. METH_VARARGS | METH_KEYWORDS,
  154. "Apply masking to the data of a WebSocket message.",
  155. },
  156. {NULL, NULL, 0, NULL}, /* Sentinel */
  157. };
  158. static struct PyModuleDef speedups_module = {
  159. PyModuleDef_HEAD_INIT,
  160. "websocket.speedups", /* m_name */
  161. "C implementation of performance sensitive functions.",
  162. /* m_doc */
  163. -1, /* m_size */
  164. speedups_methods, /* m_methods */
  165. NULL,
  166. NULL,
  167. NULL,
  168. NULL
  169. };
  170. PyMODINIT_FUNC
  171. PyInit_speedups(void)
  172. {
  173. return PyModule_Create(&speedups_module);
  174. }