Package nltk_lite :: Package contrib :: Package dependency :: Module deptree
[hide private]
[frames] | no frames]

Source Code for Module nltk_lite.contrib.dependency.deptree

  1  # Natural Language Toolkit: Dependency Trees 
  2  # 
  3  # Author: Ewan Klein <ewan@inf.ed.ac.uk> 
  4  # 
  5  # URL: <http://nltk.sf.net> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  """ 
  9  Tools for reading and writing dependency trees. 
 10  The input is assumed to be in U{Malt-TAB<http://w3.msi.vxu.se/~nivre/research/MaltXML.html>} format. 
 11   
 12  Currently only reads the first tree in a file. 
 13  """ 
 14   
 15  from nltk_lite.parse import Tree 
 16  from pprint import pformat 
 17  import re 
 18   
 19   
20 -class DepGraph(object):
21 """ 22 A container for the nodes and labelled edges of a dependency structure. 23 """
24 - def __init__(self):
25 """ 26 We place a dummy 'top' node in the first position 27 in the nodelist, since the root node is often assigned '0' 28 as its head. This also means that the indexing of the nodelist 29 corresponds directly to the Malt-TAB format, which starts at 1. 30 """ 31 top = {'word':None, 'deps':[], 'rel': 'TOP'} 32 self.nodelist = [top] 33 self.root = None 34 self.stream = None
35
36 - def __str__(self):
37 return pformat(self.nodelist)
38
39 - def load(self, file):
40 """ 41 @param file: a file in Malt-TAB format 42 """ 43 input = open(file).read() 44 return self.read(input)
45
46 - def _normalize(self, line):
47 """ 48 Deal with lines in which spaces are used rather than tabs. 49 """ 50 import re 51 SPC = re.compile(' +') 52 return re.sub(SPC, '\t', line)
53
54 - def read(self, input):
55 """ 56 @param input: a string in Malt-TAB format 57 """ 58 lines = input.split('\n') 59 count = 1 60 temp = [] 61 62 for line in lines: 63 line = self._normalize(line) 64 node = {} 65 try: 66 (word, tag, head, rel) = line.split('\t') 67 head = int(head) 68 #not required, but useful for inspection 69 node['address'] = count 70 node['word'] = word 71 node['head'] = head 72 node['rel']= rel 73 node['deps'] = [] 74 self.nodelist.append(node) 75 76 for (dep, hd) in temp: 77 #check whether I'm the head for a node with address dep 78 if hd == count: 79 #add dep to my list of dependents 80 node['deps'].append(dep) 81 try: 82 #try to add my address to my head's dependents list 83 self.nodelist[head]['deps'].append(count) 84 except IndexError: 85 #my head hasn't been seen yet, so store the info 86 temp.append((count, head)) 87 88 count += 1 89 90 except ValueError: 91 break 92 93 root_address = self.nodelist[0]['deps'][0] 94 self.root = self.nodelist[root_address] 95 return self
96
97 - def _word(self, node, filter=True):
98 w = node['word'] 99 if filter: 100 if w != ',': return w 101 return w
102
103 - def _deptree(self, i):
104 """ 105 Recursive function for turning dependency graphs into 106 NLTK trees. 107 @type i: C{int} 108 @param i: index of a node in C{nodelist} 109 @return: either a word (if the indexed node 110 is a leaf) or a L{Tree}. 111 """ 112 113 node = self.nodelist[i] 114 word = node['word'] 115 deps = node['deps'] 116 117 if len(deps) == 0: 118 return word 119 else: 120 return Tree(word, [self._deptree(j) for j in deps])
121 122
123 - def deptree(self):
124 """ 125 Starting with the C{root} node, build a dependency tree using the NLTK 126 L{Tree} constructor. Dependency labels are omitted. 127 """ 128 node = self.root 129 word = node['word'] 130 deps = node['deps'] 131 132 return Tree(word, [self._deptree(i) for i in deps])
133
134 - def _hd(self, i):
135 try: 136 return self.nodelist[i]['head'] 137 except IndexError: 138 return None
139
140 - def _rel(self, i):
141 try: 142 return self.nodelist[i]['rel'] 143 except IndexError: 144 return None
145
146 - def nx_graph(self):
147 """ 148 Convert the data in a C{nodelist} into a networkx 149 labeled directed graph. 150 @rtype: C{XDigraph} 151 """ 152 nx_nodelist = range(1, len(self.nodelist)) 153 nx_edgelist = [(n, self._hd(n), self._rel(n)) 154 for n in nx_nodelist if self._hd(n)] 155 self.nx_labels = {} 156 for n in nx_nodelist: 157 self.nx_labels[n] = self.nodelist[n]['word'] 158 159 g = NX.XDiGraph() 160 g.add_nodes_from(nx_nodelist) 161 g.add_edges_from(nx_edgelist) 162 163 return g
164 165 166
167 -def demo(nx=False):
168 """ 169 A demonstration of the result of reading a dependency 170 version of the first sentence of the Penn Treebank. 171 """ 172 dg = DepGraph().read("""Pierre NNP 2 NMOD 173 Vinken NNP 8 SUB 174 , , 2 P 175 61 CD 5 NMOD 176 years NNS 6 AMOD 177 old JJ 2 NMOD 178 , , 2 P 179 will MD 0 ROOT 180 join VB 8 VC 181 the DT 11 NMOD 182 board NN 9 OBJ 183 as IN 9 VMOD 184 a DT 15 NMOD 185 nonexecutive JJ 15 NMOD 186 director NN 12 PMOD 187 Nov. NNP 9 VMOD 188 29 CD 16 NMOD 189 . . 9 VMOD 190 """) 191 tree = dg.deptree() 192 print tree.pp() 193 if nx: 194 #currently doesn't work 195 try: 196 import networkx as NX 197 import pylab as P 198 except ImportError: 199 raise 200 g = dg.nx_graph() 201 g.info() 202 pos = NX.spring_layout(g, dim=1) 203 NX.draw_networkx_nodes(g, pos, node_size=50) 204 #NX.draw_networkx_edges(g, pos, edge_color='k', width=8) 205 NX.draw_networkx_labels(g, pos, dg.nx_labels) 206 P.xticks([]) 207 P.yticks([]) 208 P.savefig('deptree.png') 209 P.show()
210 211 if __name__ == '__main__': 212 demo() 213