ufal.udpipe

Bindings to UDPipe library

mpl-2.0 9 个版本

Milan Straka <straka@ufal.mff.cuni.cz>

安装

pip install ufal.udpipe

poetry add ufal.udpipe

pipenv install ufal.udpipe

conda install ufal.udpipe

描述

ufal.udpipe

ufal.udpipe is a Python binding to the UDPipe library http://ufal.mff.cuni.cz/udpipe.

The bindings are a straightforward conversion of the C++ bindings API. In Python 2, strings can be both unicode and UTF-8 encoded str, and the library always produces unicode. In Python 3, strings must be only str.

Wrapped C++ API

The C++ API being wrapped follows. For an API reference of the original C++ API, see http://ufal.mff.cuni.cz/udpipe/api-reference.

Helper Structures

typedef vector<int> Children;

typedef vector<uint8_t> Bytes;

typedef vector<string> Comments;

class ProcessingError {
public:
  bool occurred();
  string message;
};

class Token {
 public:
  string form;
  string misc;

  Token(const string& form = string(), const string& misc = string());

  // CoNLL-U defined SpaceAfter=No feature
  bool getSpaceAfter() const;
  void setSpaceAfter(bool space_after);

  // UDPipe-specific all-spaces-preserving SpacesBefore and SpacesAfter features
  string getSpacesBefore() const;
  void setSpacesBefore(const string& spaces_before);
  string getSpacesAfter() const;
  void setSpacesAfter(const string& spaces_after);
  string getSpacesInToken() const;
  void setSpacesInToken(const string& spaces_in_token);

  // UDPipe-specific TokenRange feature
  bool getTokenRange() const;
  size_t getTokenRangeStart() const;
  size_t getTokenRangeEnd() const;
  void setTokenRange(size_t start, size_t end);
};

class Word : public Token {
 public:
  // form and misc are inherited from token
  int id;         // 0 is root, >0 is sentence word, <0 is undefined
  string lemma;   // lemma
  string upostag; // universal part-of-speech tag
  string xpostag; // language-specific part-of-speech tag
  string feats;   // list of morphological features
  int head;       // head, 0 is root, <0 is undefined
  string deprel;  // dependency relation to the head
  string deps;    // secondary dependencies

  Children children;

  Word(int id = -1, const string& form = string());
};
typedef vector<Word> Words;

class MultiwordToken : public Token {
 public:
  // form and misc are inherited from token
  int idFirst, idLast;

  MultiwordToken(int id_first = -1, int id_last = -1, const string& form = string(), const string& misc = string());
};
typedef vector<MultiwordToken> MultiwordTokens;

class EmptyNode {
 public:
  int id;          // 0 is root, >0 is sentence word, <0 is undefined
  int index;       // index for the current id, should be numbered from 1, 0=undefined
  string form;     // form
  string lemma;    // lemma
  string upostag;  // universal part-of-speech tag
  string xpostag;  // language-specific part-of-speech tag
  string feats;    // list of morphological features
  string deps;     // secondary dependencies
  string misc;     // miscellaneous information

  EmptyNode(int id = -1, int index = 0) : id(id), index(index) {}
};
typedef vector<EmptyNode> EmptyNodes;

class Sentence {
 public:
  Sentence();

  Words words;
  MultiwordTokens multiwordTokens;
  EmptyNodes emptyNodes;
  Comments comments;
  static const string rootForm;

  // Basic sentence modifications
  bool empty();
  void clear();
  virtual Word& addWord(const char* form);
  void setHead(int id, int head, const string& deprel);
  void unlinkAllWords();

  // CoNLL-U defined comments
  bool getNewDoc() const;
  string getNewDocId() const;
  void setNewDoc(bool new_doc, const string& id = string());
  bool getNewPar() const;
  string getNewParId() const;
  void setNewPar(bool new_par, const string& id = string());

  string getSentId() const;
  void setSentId(const string& id);
  string getText() const;
  void setText(const string& text);
};
typedef vector<Sentence> Sentences;

Main Classes

class InputFormat {
 public:
  virtual void resetDocument(const string& id = string());
  virtual void setText(const char* text);
  virtual bool nextSentence(Sentence& s, ProcessingError* error = nullptr);

  static InputFormat* newInputFormat(const string& name);
  static InputFormat* newConlluInputFormat(const string& id = string());
  static InputFormat* newGenericTokenizerInputFormat(const string& id = string());
  static InputFormat* newHorizontalInputFormat(const string& id = string());
  static InputFormat* newVerticalInputFormat(const string& id = string());

  static InputFormat* newPresegmentedTokenizer(InputFormat tokenizer);

  static const string CONLLU_V1;
  static const string CONLLU_V2;
  static const string GENERIC_TOKENIZER_NORMALIZED_SPACES;
  static const string GENERIC_TOKENIZER_PRESEGMENTED;
  static const string GENERIC_TOKENIZER_RANGES;
};

class OutputFormat {
 public:
  virtual string writeSentence(const Sentence& s);
  virtual string finishDocument();

  static OutputFormat* newOutputFormat(const string& name);
  static OutputFormat* newConlluOutputFormat(const string& options = string());
  static OutputFormat* newEpeOutputFormat(const string& options = string());
  static OutputFormat* newMatxinOutputFormat(const string& options = string());
  static OutputFormat* newHorizontalOutputFormat(const string& options = string());
  static OutputFormat* newPlaintextOutputFormat(const string& options = string());
  static OutputFormat* newVerticalOutputFormat(const string& options = string());

  static const string CONLLU_V1;
  static const string CONLLU_V2;
  static const string HORIZONTAL_PARAGRAPHS;
  static const string PLAINTEXT_NORMALIZED_SPACES;
  static const string VERTICAL_PARAGRAPHS;
};

class Model {
 public:
  static Model* load(const char* fname);

  virtual InputFormat* newTokenizer(const string& options) const;
  virtual bool tag(Sentence& s, const string& options, ProcessingError* error = nullptr) const;
  virtual bool parse(Sentence& s, const string& options, ProcessingError* error) const;

  static const string DEFAULT;
  static const string TOKENIZER_PRESEGMENTED;
};

class Pipeline {
 public:
  Pipeline(const Model* m, const string& input, const string& tagger, const string& parser, const string& output);

  void setModel(const Model* m);
  void setInput(const string& input);
  void setTagger(const string& tagger);
  void setParser(const string& parser);
  void setOutput(const string& output);

  void setImmediate(bool immediate);
  void setDocumentId(const string& document_id);

  string process(const string& data, ProcessingError* error = nullptr) const;

  static const string DEFAULT;
  static const string NONE;
};

class Trainer {
 public:

  static Bytes* train(const string& method, const Sentences& train, const Sentences& heldout,
                      const string& tokenizer, const string& tagger, const string& parser,
                      ProcessingError* error = nullptr);

  static const string DEFAULT;
  static const string NONE;
};

class Evaluator {
 public:
  Evaluator(const Model* m, const string& tokenizer, const string& tagger, const string& parser);

  void setModel(const Model* m);
  void setTokenizer(const string& tokenizer);
  void setTagger(const string& tagger);
  void setParser(const string& parser);

  string evaluate(const string& data, ProcessingError* error = nullptr) const;

  static const string DEFAULT;
  static const string NONE;
};

class Version {
 public:
  unsigned major;
  unsigned minor;
  unsigned patch;
  string prerelease;

  // Returns current version.
  static Version current();
};

Examples

run_udpipe

Simple pipeline loading data (tokenizing on request), tagging, parsing and writing to specified output format::

import sys

from ufal.udpipe import Model, Pipeline, ProcessingError # pylint: disable=no-name-in-module

# In Python2, wrap sys.stdin and sys.stdout to work with unicode.
if sys.version_info[0] < 3:
    import codecs
    import locale
    encoding = locale.getpreferredencoding()
    sys.stdin = codecs.getreader(encoding)(sys.stdin)
    sys.stdout = codecs.getwriter(encoding)(sys.stdout)

if len(sys.argv) < 4:
    sys.stderr.write('Usage: %s input_format(tokenize|conllu|horizontal|vertical) output_format(conllu) model_file\n' % sys.argv[0])
    sys.exit(1)

sys.stderr.write('Loading model: ')
model = Model.load(sys.argv[3])
if not model:
    sys.stderr.write("Cannot load model from file '%s'\n" % sys.argv[3])
    sys.exit(1)
sys.stderr.write('done\n')

pipeline = Pipeline(model, sys.argv[1], Pipeline.DEFAULT, Pipeline.DEFAULT, sys.argv[2])
error = ProcessingError()

# Read whole input
text = ''.join(sys.stdin.readlines())

# Process data
processed = pipeline.process(text, error)
if error.occurred():
    sys.stderr.write("An error occurred when running run_udpipe: ")
    sys.stderr.write(error.message)
    sys.stderr.write("\n")
    sys.exit(1)
sys.stdout.write(processed)

AUTHORS

Milan Straka <straka@ufal.mff.cuni.cz>

COPYRIGHT AND LICENCE

This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

分类

Development Status :: 5 - Production/Stable
Programming Language :: C++
Programming Language :: Python
Programming Language :: Python :: 3
Topic :: Software Development :: Libraries

版本列表

1.4.0.1 2025-11-20

1.3.1.1 2023-11-16

1.3.0.1 2023-02-16

1.2.0.3 2023-01-26

1.2.0.2 2019-05-02

1.2.0.1 2017-08-02

1.1.0.2 2017-04-15

1.1.0.1 2017-03-29

1.0.0.1 2016-05-27